# ml_code_output
# ml_code_output
import pandas as pd
import numpy as np
#import plotly.express as px
#Data Preprocessing
# NOTE(review): df is not created in the visible part of this script;
# presumably it is loaded from a CSV earlier - confirm.
#
# Fill missing drop-off coordinates (latitude with the column mean, longitude
# with the column median). The filled Series is assigned back to the column:
# calling fillna(inplace=True) on a column selection is chained assignment and
# is not guaranteed to modify df itself.
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean())
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(df['dropoff_longitude'].median())
df.isnull().sum()  # per-column null counts (result is neither stored nor printed)
print(df.dtypes)
# Column pickup_datetime is in the wrong format (object). Convert it to
# datetime; errors='coerce' turns unparseable values into NaT.
df.pickup_datetime = pd.to_datetime(df.pickup_datetime, errors='coerce')
print(df.dtypes)
# Derive calendar features from the pickup timestamp.
df = df.assign(
    hour=df.pickup_datetime.dt.hour,
    day=df.pickup_datetime.dt.day,
    month=df.pickup_datetime.dt.month,
    year=df.pickup_datetime.dt.year,
    dayofweek=df.pickup_datetime.dt.dayofweek,
)
print(df)
# Drop the column 'pickup_datetime' using drop();
# 'axis=1' drops the specified column.
df = df.drop('pickup_datetime', axis=1)
print(df)
#Checking outliers and filling them
#plt.plot(kind = "box",subplots = True,layout = (7,2),figsize=(15,20)) #Boxplot to check the outliers
# NOTE(review): travel_dist is not defined in the visible part of this script;
# presumably it holds one distance per row, computed earlier - confirm.
#print(travel_dist)
df['dist_travel_km'] = travel_dist
print("Distance Calculated", df, sep="\n")

# Pairwise correlation between the DataFrame's columns.
corr = df.corr()
print("Correlation", corr, sep="\n")
# Feature matrix (coordinates, passenger count, calendar fields, distance)
# and target (fare). The column list was previously hard-wrapped in the
# middle of a string literal, which made the statement a SyntaxError.
x = df[[
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
    'hour', 'day', 'month', 'year', 'dayofweek',
    'dist_travel_km',
]]
y = df['fare_amount']
#Linear Regression
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
# NOTE(review): X_train / y_train are not defined in the visible part of this
# script; presumably they come from a train/test split of x and y - confirm.
regression.fit(X_train, y_train)
#To find the linear intercept
print("#To find the linear intercept")
print(regression.intercept_)
#Metrics Evaluation using R2, Mean Squared Error, Root Mean Squared Error
print("RMSE")
# NOTE(review): MSE_Random is not defined in the visible part of this script;
# the name suggests the MSE of a separate model - confirm.
RMSE_Random = np.sqrt(MSE_Random)
print(RMSE_Random)
# Output:
# Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
# Load the e-mail dataset and echo it.
df = pd.read_csv('emails.csv')
print(df)
df.columns
df.isnull().sum()
# Discard rows containing missing values, then the 'Email No.' column.
df = df.dropna()
df = df.drop(columns=['Email No.'])
# The label is 'Prediction'; every remaining column is a feature.
y = df['Prediction']
X = df.drop(columns=['Prediction'])
#KNN classifier
# NOTE(review): knn, X_train, X_test, y_train and y_test are not defined in
# the visible part of this script; presumably a KNN estimator and a
# train/test split of X and y are created earlier - confirm.
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Prediction",y_pred)
print("KNN accuracy = ",metrics.accuracy_score(y_test,y_pred))
print("Confusion matrix",metrics.confusion_matrix(y_test,y_pred))
#SVM classifier
model = SVC(C = 1)
# fit
model.fit(X_train, y_train)
# predict
y_pred = model.predict(X_test)
# Print the matrix, as the KNN branch does; as a bare expression in a script
# the result was computed and silently discarded.
print("Confusion matrix",metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
# %% [code]
# Load the bank-churn dataset.
df = pd.read_csv("Churn_Modelling.csv")
# %% [markdown]
# Preprocessing.
# %% [code]
# Quick inspection: sample rows, shape, summary statistics, null counts,
# dtypes and column names.
df.head()
df.shape
df.describe()
df.isnull()
df.isnull().sum()
df.info()
df.dtypes
df.columns
# Remove the 'RowNumber', 'Surname' and 'CustomerId' columns.
df = df.drop(columns=['RowNumber', 'Surname', 'CustomerId'])
df.head()
def visualization(x, y, xlabel):
    """Plot a histogram comparing two groups of customers.

    A new 10x5 figure is created and both groups are drawn in a single
    ``plt.hist`` call. The figure is not shown here; the caller decides when
    to call ``plt.show()``.

    Args:
        x: Values for the first group; drawn in red and labelled 'exit'.
        y: Values for the second group; drawn in green and labelled
            'not_exit'.
        xlabel: Text used as the x-axis label.
    """
    plt.figure(figsize=(10, 5))
    plt.hist([x, y], color=['red', 'green'], label=['exit', 'not_exit'])
    plt.xlabel(xlabel, fontsize=20)
    plt.ylabel("No. of customers", fontsize=20)
    plt.legend()
# Tenure values split by whether the customer exited.
df_churn_exited = df[df['Exited']==1]['Tenure']
df_churn_not_exited = df[df['Exited']==0]['Tenure']
# First feature selection, still containing the raw 'Gender' column; it is
# replaced further down once the dummy columns exist. The column list was
# previously hard-wrapped inside a string literal (SyntaxError).
X = df[[
    'CreditScore', 'Gender', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
]]
# One-hot encode the categorical columns, dropping the first level of each,
# and append the resulting columns to df.
states = pd.get_dummies(df['Geography'],drop_first = True)
gender = pd.get_dummies(df['Gender'],drop_first = True)
df = pd.concat([df,gender,states], axis = 1)
df.head()
# Final feature matrix: numeric columns plus the dummy columns added above.
X = df[[
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Male', 'Germany', 'Spain',
]]
y = df['Exited']
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
# Load the diabetes dataset and look at its columns and null counts.
df = pd.read_csv('diabetes.csv')
df.columns
df.isnull().sum()
# 'Outcome' is the label; every other column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')
# NOTE(review): knn, X_train, X_test, y_train and y_test are not defined in
# the visible part of this script - confirm where they are created.
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Acccuracy ", metrics.accuracy_score(y_test, y_pred))
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#Importing the required libraries.
from sklearn.cluster import KMeans, k_means #For clustering
from sklearn.decomposition import PCA #Linear Dimensionality reduction.
# Loading the dataset.
df = pd.read_csv("sales_data_sample.csv", encoding='unicode_escape')
print(df)
df_drop = ['ADDRESSLINE1', 'ADDRESSLINE2', 'STATUS','POSTALCODE', 'CITY',
'TERRITORY', 'PHONE', 'STATE', 'CONTACTFIRSTNAME', 'CONTACTLASTNAME',
'CUSTOMERNAME', 'ORDERNUMBER']
# Dropping the unnecessary categorical columns along with columns having null
# values. The null values can't be filled as there are a lot of them.
df = df.drop(df_drop, axis=1)
print(df)
# Checking the categorical columns.
df['COUNTRY'].unique()
df['PRODUCTLINE'].unique()
df['DEALSIZE'].unique()
# Converting the categorical columns into dummy columns.
productline = pd.get_dummies(df['PRODUCTLINE'])
Dealsize = pd.get_dummies(df['DEALSIZE'])
df = pd.concat([df,productline,Dealsize], axis = 1)
# Dropping the encoded source columns; Country is dropped too as there are a
# lot of countries.
df_drop = ['COUNTRY','PRODUCTLINE','DEALSIZE']
df = df.drop(df_drop, axis=1)
# Converting the datatype: PRODUCTCODE is replaced by its category codes.
df['PRODUCTCODE'] = pd.Categorical(df['PRODUCTCODE']).codes
# Dropping the Orderdate as Month is already included.
df.drop('ORDERDATE', axis=1, inplace=True)
df.dtypes #All the datatypes are converted into numeric
# Elbow method: fit K-Means for k = 1..9 and record each model's inertia
# (within-cluster sum of squares from the centroids).
distortions = []
K = range(1,10)
for k in K:
    # The loop body had lost its indentation, which is an IndentationError.
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(df)
    distortions.append(kmeanModel.inertia_) #Appending the inertia to the distortions
# Plot inertia against k.
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
# Converting all the features into 2 columns to make it easy to visualize
# using Principal Component Analysis.
pca = PCA(n_components=2)
# NOTE(review): X_train is not defined in the visible part of this script;
# confirm which data is meant to be projected here.
# Creating a DataFrame from the two principal components.
reduced_X = pd.DataFrame(pca.fit_transform(X_train),columns=['PCA1','PCA2'])
print(reduced_X)
plt.figure(figsize=(14,10))
plt.scatter(reduced_X['PCA1'],reduced_X['PCA2'])
# NOTE(review): reduced_centers is not defined in the visible part of this
# script; presumably the cluster centres projected with the same PCA - confirm.
#Plotting the centroids
plt.scatter(reduced_centers[:,0],reduced_centers[:,1],color='black',marker='x',s=300)
plt.show()
# Second plot: the centroids again, without the data points.
plt.scatter(reduced_centers[:,0],reduced_centers[:,1],color='black',marker='x',s=300)
plt.show()
# Output: