air-quality-randomforest
air-quality-randomforest
import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset location used further down can be confirmed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/air-quality-and-pollution-assessment/pollution_dataset.csv
Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
Data Loading
# Read the pollution dataset from the Kaggle input mount and drop every row
# that contains a missing value before any further processing.
dataset_path = '/kaggle/input/air-quality-and-pollution-assessment/pollution_dataset.csv'
df = pd.read_csv(dataset_path).dropna()

# Preview the first three rows.
df.head(3)
Temperature Humidity PM2.5 PM10 NO2 SO2 CO Proximity_to_Industrial_Areas Population_Density Air Quality
0 27.2 51.7 35.1 46.2 26.7 32.2 0.98 11.2 314 Hazardous
1 26.3 59.3 1.0 6.2 38.3 20.4 0.68 13.5 298 Good
2 27.9 73.2 20.0 39.4 19.6 5.8 0.95 5.4 309 Good
Data Preprocessing
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Temperature 5000 non-null float64
1 Humidity 5000 non-null float64
2 PM2.5 5000 non-null float64
3 PM10 5000 non-null float64
4 NO2 5000 non-null float64
5 SO2 5000 non-null float64
6 CO 5000 non-null float64
7 Proximity_to_Industrial_Areas 5000 non-null float64
8 Population_Density 5000 non-null int64
9 Air Quality 5000 non-null object
dtypes: float64(8), int64(1), object(1)
memory usage: 390.8+ KB
# List the distinct labels present in the target column before encoding them.
df['Air Quality'].unique()
Encoding
# Integer codes for the target labels: Hazardous -> 0, Poor -> 1,
# Moderate -> 2, Good -> 3.
custom_mapping = {
    label: code
    for code, label in enumerate(('Hazardous', 'Poor', 'Moderate', 'Good'))
}

# Replace the string labels in place with their integer codes.
df['Air Quality'] = df['Air Quality'].map(custom_mapping)

# Show the frame after encoding.
print("Updated DataFrame:", df, sep="\n")
Updated DataFrame:
Temperature Humidity PM2.5 PM10 NO2 SO2 CO \
0 27.2 51.7 35.1 46.2 26.7 32.2 0.98
1 26.3 59.3 1.0 6.2 38.3 20.4 0.68
2 27.9 73.2 20.0 39.4 19.6 5.8 0.95
3 23.9 51.9 14.7 24.3 5.2 12.6 1.24
4 25.2 59.0 26.3 30.9 26.8 13.5 1.06
... ... ... ... ... ... ... ...
4995 29.3 36.8 80.3 90.9 9.2 14.1 0.97
4996 15.7 51.7 0.7 11.4 40.5 13.8 1.07
4997 27.8 48.1 8.9 16.4 8.6 17.7 0.54
4998 30.4 50.4 2.2 18.8 13.1 22.3 0.94
4999 21.5 76.5 45.0 58.0 37.9 0.0 0.96
#Hazardous -> 0
#Poor -> 1
#Moderate -> 2
#Good -> 3
# Preview the first three rows after the target column has been encoded.
df.head(3)
Temperature Humidity PM2.5 PM10 NO2 SO2 CO Proximity_to_Industrial_Areas Population_Density Air Quality
Train-Test
# Separate the encoded target from the feature columns.
y = df["Air Quality"]
x = df.drop("Air Quality", axis=1)

# Hold out 20% of the rows for evaluation. The later cells use x_train,
# x_test, y_train and y_test, so the split has to happen here; the fixed
# seed keeps the partition reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Only instantiated here; the scaler is fitted on the training split afterwards.
scaler = StandardScaler()
Standard Scaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits so no test information is used
# during fitting.
scaler.fit(x_train)
x_train_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_test)
)
# NOTE: scaling wrote its results to x_train_scaled / x_test_scaled; df itself
# was not reassigned, so this preview still shows the unscaled values.
df.head(3)
Temperature Humidity PM2.5 PM10 NO2 SO2 CO Proximity_to_Industrial_Areas Population_Density Air Quality
Random Forest
# A fixed seed keeps the fitted forest reproducible between runs; every other
# hyperparameter is left at the library default.
forest_kwargs = {"random_state": 42}
clf = RandomForestClassifier(**forest_kwargs)

# Train on the scaled training features and their encoded labels.
clf.fit(x_train_scaled, y_train)
▾ RandomForestClassifier
RandomForestClassifier(random_state=42)
Predict
# Predict labels for the held-out split.
y_pred = clf.predict(x_test_scaled)

# Report overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
Accuracy: 0.37
Classification Report:
precision recall f1-score support
Data Visualization
import warnings
# Install a blanket 'ignore' filter: with no category or module given, every
# warning raised from this point on is suppressed.
# NOTE(review): presumably meant to keep the plotting output clean — consider
# narrowing to specific warning categories so real problems are not hidden.
warnings.filterwarnings('ignore')