Train
Train
Train
Load Libraries
In [1]:
import pandas as pd
Load Dataset
In [2]:
df = pd.read_csv('./tennis.csv')
Explore Dataset
In [3]:
df.head()
Out[3]:
tem
outlook humidity windy play
p
In [4]:
df.shape
Out[4]:
(14, 5)
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 outlook 14 non-null object
1 temp 14 non-null object
2 humidity 14 non-null object
3 windy 14 non-null bool
4 play 14 non-null object
dtypes: bool(1), object(4)
memory usage: 590.0+ bytes
In [16]:
for i in df.columns:
print(f'{i} : {df[i].unique()}')
outlook : ['sunny' 'overcast' 'rainy']
temp : ['hot' 'mild' 'cool']
humidity : ['high' 'normal']
windy : [False True]
play : ['no' 'yes']
Q2)
You are given a dataset Housing.csv, which contains information about various features of houses and
their corresponding prices. The goal is to predict the house prices based on the available features
using linear regression.
Code :
import pandas as pd
import numpy as np
df = pd.read_csv('Housing.csv')
# Check for missing values
print(df.isnull().sum())
# Fill missing values for numerical columns with mean and categorical columns with mode
df.fillna(df.mean(), inplace=True)
df['Column_name'] = df['Column_name'].fillna(df['Column_name'].mode()[0])
X = df_encoded.drop(columns=['Price'])
y = df_encoded['Price']
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.show()
# Residuals plot
sns.scatterplot(x=y_pred, y=residuals)
plt.xlabel("Predicted Prices")
plt.ylabel("Residuals")
plt.show()
Q: Design a task where you acquire two distinct types of datasets: one comprising numerical data and
the other categorical data. Subsequently, you will perform Linear Regression on the dataset containing
numerical values, and Logistic Regression on the dataset containing categorical values.
import pandas as pd
import numpy as np
data_numerical = {
'Size': [1200, 1500, 1800, 2000, 1600, 1100, 2500, 2200, 2300, 1400],
'Price': [400000, 500000, 450000, 350000, 475000, 320000, 600000, 550000, 580000, 330000]
df_numerical = pd.DataFrame(data_numerical)
y_numerical = df_numerical['Price']
# Split the data into training and testing sets for linear regression
linear_model = LinearRegression()
linear_model.fit(X_train_num, y_train_num)
y_pred_num = linear_model.predict(X_test_num)
data_categorical = {
'Age_Group': ['18-25', '26-35', '36-45', '46-60', '18-25', '26-35', '36-45', '46-60', '18-25', '26-35'],
'Gender': ['Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male'],
'Product_Category': ['A', 'B', 'A', 'B', 'B', 'A', 'B', 'A', 'B', 'A'],
}
df_categorical = pd.DataFrame(data_categorical)
X_categorical = df_categorical_encoded.drop(columns=['Purchased'])
y_categorical = df_categorical_encoded['Purchased']
# Split the data into training and testing sets for logistic regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train_cat, y_train_cat)
y_pred_cat = logistic_model.predict(X_test_cat)
print(f"R-squared: {r2_num}")
print(f"Accuracy: {accuracy_cat}")
print(f"Confusion Matrix:\n{conf_matrix_cat}")
question 4
You decide to use the algorithm for classification.The dataset is split into 80%
training data and 20% test data.
1. Load the dataset and preprocess it (handle missing values, normalize the
data if necessary).
2. Implement the KNN algorithm to classify.
3. Evaluate the model's performance by calculating the and displaying the .
import pandas as pd
import numpy as np
# Load the dataset (replace 'your_dataset.csv' with the actual path to your dataset)
data = {
df = pd.DataFrame(data)
print("Original Dataset:")
print(df)
imputer = SimpleImputer(strategy='mean') # Fill missing values with the mean of the column
df_imputed = pd.DataFrame(imputer.fit_transform(df.drop(columns=['Target'])))
df_imputed['Target'] = df['Target'] # Add target column back
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_imputed.drop(columns=['Target']))
y = df_imputed['Target']
# Step 2: Split the dataset into training and testing sets (80% training, 20% testing)
# Initialize the KNN classifier (let's use k=3 for this example)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Calculate accuracy
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Question 2
Your task is to take an unclean dataset and drop the unnecessary columns from it. Then, check
the remaining columns to see if there are any NaN (missing) values, and if there are, fill those
values. After that, apply One-hot encoding to the categorical values. Finally, calculate the mean
of the columns you are using.
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# For this demonstration, we'll create a synthetic dataset with some NaN values and categorical data.
data = {
# Create DataFrame
df = pd.DataFrame(data)
print("Original Dataset:")
print(df)
df_cleaned = df.drop(columns=['UnnecessaryFeature'])
print(df_cleaned.isna().sum())
df_cleaned = df_cleaned.fillna(df_cleaned.mean())
print(df_cleaned.isna().sum())
print(df_encoded)
mean_values = df_encoded.mean()
print("\nMean of Columns:")
print(mean_values)
output :
Original Dataset:
0 1.0 5.0 A 10 X
1 2.0 NaN B 20 Y
2 NaN 3.0 A 30 Z
3 4.0 2.0 B 40 X
4 5.0 1.0 A 50 Y
Feature1 1
Feature2 1
Feature3 0
Feature4 0
UnnecessaryFeature 0
dtype: int64
Feature1 0
Feature2 0
Feature3 0
Feature4 0
dtype: int64
0 1.0 5.0 10 0
1 2.0 2.0 20 1
2 3.0 3.0 30 0
3 4.0 2.0 40 1
4 5.0 1.0 50 0
Mean of Columns:
Feature1 3.000000
Feature2 2.750000
Feature4 30.000000
Feature3_B 0.400000
dtype: float64
OR
import pandas as pd
file_path = '/mnt/data/laptopData.csv'
data = pd.read_csv(file_path)
# Fill missing values: Numerical columns will be filled with their mean; categorical with the mode.
data_cleaned[column].fillna(data_cleaned[column].mode()[0], inplace=True)
data_cleaned[column].fillna(data_cleaned[column].mean(), inplace=True)
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
column_means = data_encoded.mean()
# Display the results
print("Column Means:")
print(column_means)