ml_code_output

Code:

#Importing the required libraries


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#load the data into a dataframe


df = pd.read_csv('uber.csv')
#df1 = pd.read_csv('uber.csv')

#check the first 5 rows


df.head()
#print Dataset
print("Original Dataset")
print(df)

#Data Preprocessing

#drop the unnecessary columns


#df = df.drop(columns=['Unnamed: 0', 'key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'])
df = df.drop(['Unnamed: 0', 'key'], axis=1) #Drop the unnamed index and key columns as they aren't required
print("Dataset after dropping the unnecessary columns")
print(df)
print(df.dtypes) #To get the type of each column
print(df.shape) #To get the total (Rows,Columns)
print(df.describe()) #To get statistics of each columns
# Filling Missing Values
df.isnull().sum()

df['dropoff_latitude'].fillna(value=df['dropoff_latitude'].mean(),inplace = True)
df['dropoff_longitude'].fillna(value=df['dropoff_longitude'].median(),inplace = True)
df.isnull().sum()
print(df.dtypes)
#Column pickup_datetime is in wrong format (Object). Convert it to DateTime Format
df.pickup_datetime = pd.to_datetime(df.pickup_datetime, errors='coerce')
print(df.dtypes)
df= df.assign(hour = df.pickup_datetime.dt.hour,
day= df.pickup_datetime.dt.day,
month = df.pickup_datetime.dt.month,
year = df.pickup_datetime.dt.year,
dayofweek = df.pickup_datetime.dt.dayofweek)
print(df)
# drop the column 'pickup_datetime' using drop()
# 'axis = 1' drops the specified column

df = df.drop('pickup_datetime',axis=1)
print(df)
#Checking for outliers and capping them
#df.plot(kind="box", subplots=True, layout=(7,2), figsize=(15,20)) #Boxplot to check the outliers

#Using the InterQuartile Range to cap the outlier values


def remove_outlier(df1, col):
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1 - 1.5*IQR
    upper_whisker = Q3 + 1.5*IQR
    df1[col] = np.clip(df1[col], lower_whisker, upper_whisker) #Cap values outside the whiskers
    return df1

def treat_outliers_all(df1, col_list):
    for c in col_list:
        df1 = remove_outlier(df1, c)
    return df1

df = treat_outliers_all(df, df.columns)


print("Outliers")
print(df)

#pip install haversine


import haversine as hs #Use the Haversine formula to calculate the distance between two points on a sphere; Euclidean distance only applies to a flat surface.
travel_dist = []
for pos in range(len(df['pickup_longitude'])):
    long1, lati1, long2, lati2 = [df['pickup_longitude'][pos], df['pickup_latitude'][pos],
                                  df['dropoff_longitude'][pos], df['dropoff_latitude'][pos]]
    loc1 = (lati1, long1)
    loc2 = (lati2, long2)
    c = hs.haversine(loc1, loc2)
    travel_dist.append(c)

#print(travel_dist)
df['dist_travel_km'] = travel_dist
print("Distance Calculated")
print(df)
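
#For reference, a hedged sketch of the haversine formula itself (assuming a mean
#Earth radius of 6371 km); hs.haversine computes essentially this for each pair.
#The helper name haversine_km is illustrative, not part of the original code.
def haversine_km(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    a = np.sin((lat2 - lat1)/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin((lon2 - lon1)/2)**2
    return 2 * 6371 * np.arcsin(np.sqrt(a)) #Great-circle distance in km
#Example usage: roughly 8 km between two points in Manhattan
#print(haversine_km(40.7128, -74.0060, 40.7831, -73.9712))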
#Compute the correlation matrix
corr = df.corr()
print("Correlation")
print(corr)

#Dividing the dataset into feature and target values

x = df[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
        'passenger_count', 'hour', 'day', 'month', 'year', 'dayofweek', 'dist_travel_km']]
y = df['fare_amount']

#Dividing the dataset into training and testing dataset

from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.33)

#Linear Regression
from sklearn.linear_model import LinearRegression
regression = LinearRegression()

regression.fit(X_train,y_train)
#To find the linear intercept
print("Linear intercept")
print(regression.intercept_)

#To find the linear coefficients
print("Linear coefficients")
print(regression.coef_)

#To predict the target values


prediction = regression.predict(X_test)
print("Prediction")
print(prediction)

print("Y Test ")


print(y_test)

#Metrics Evaluation using R2, Mean Squared Error, Root Mean Squared Error

from sklearn.metrics import r2_score


print("R2 Score")
print(r2_score(y_test,prediction))
print("MSE")
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test,prediction)
print(MSE)
print("RMSE")
RMSE = np.sqrt(MSE)
print(RMSE)
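
#For reference, a hedged sketch of what these three metrics compute, written out
#with plain numpy (reuses y_test and prediction from above; should match sklearn).
y_true = np.asarray(y_test)
y_hat = np.asarray(prediction)
mse_manual = np.mean((y_true - y_hat)**2) #Mean Squared Error
rmse_manual = np.sqrt(mse_manual) #Root Mean Squared Error
ss_res = np.sum((y_true - y_hat)**2) #Residual sum of squares
ss_tot = np.sum((y_true - np.mean(y_true))**2) #Total sum of squares
r2_manual = 1 - ss_res/ss_tot #R2 score = 1 - SS_res/SS_tot
print(r2_manual, mse_manual, rmse_manual)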

#Random Forest Regression

from sklearn.ensemble import RandomForestRegressor


#n_estimators is the number of trees the forest builds before averaging their predictions
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
print(y_pred)

#Metrics evaluation for Random Forest


R2_Random = r2_score(y_test,y_pred)
print("R2 Random")
print(R2_Random)
print("MSE_Random")
MSE_Random = mean_squared_error(y_test,y_pred)
print(MSE_Random)

print("RMSE")

RMSE_Random = np.sqrt(MSE_Random)

print(RMSE_Random)
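
#Optional, hedged follow-up: inspect which features the forest relied on most.
#feature_importances_ is a standard RandomForestRegressor attribute; x.columns
#holds the feature names selected above.
importances = pd.Series(rf.feature_importances_, index=x.columns).sort_values(ascending=False)
print(importances)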
Output:
Code:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

df=pd.read_csv('emails.csv')

print(df)
df.columns
df.isnull().sum()
df.dropna(inplace = True)
df.drop(['Email No.'],axis=1,inplace=True)
X = df.drop(['Prediction'],axis = 1)
y = df['Prediction']

from sklearn.preprocessing import scale


X = scale(X)
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

#KNN classifier

from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Prediction",y_pred)
print("KNN accuracy = ",metrics.accuracy_score(y_test,y_pred))

print("Confusion matrix",metrics.confusion_matrix(y_test,y_pred))

#SVM classifier

model = SVC(C = 1)

# fit
model.fit(X_train, y_train)

# predict
y_pred = model.predict(X_test)

metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

print("SVM accuracy = ",metrics.accuracy_score(y_test,y_pred))


Output:
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt #Importing the libraries

# %% [code]
df = pd.read_csv("Churn_Modelling.csv")

# %% [markdown]
# Preprocessing.

# %% [code]
df.head()
df.shape
df.describe()
df.isnull()
df.isnull().sum()
df.info()
df.dtypes
df.columns
df = df.drop(['RowNumber', 'Surname', 'CustomerId'], axis= 1)

df.head()
def visualization(x, y, xlabel):
    plt.figure(figsize=(10,5))
    plt.hist([x, y], color=['red', 'green'], label = ['exit','not_exit'])
    plt.xlabel(xlabel,fontsize=20)
    plt.ylabel("No. of customers", fontsize=20)
    plt.legend()
df_churn_exited = df[df['Exited']==1]['Tenure']
df_churn_not_exited = df[df['Exited']==0]['Tenure']

visualization(df_churn_exited, df_churn_not_exited, "Tenure")


df_churn_exited2 = df[df['Exited']==1]['Age']
df_churn_not_exited2 = df[df['Exited']==0]['Age']
visualization(df_churn_exited2, df_churn_not_exited2, "Age")

X = df[['CreditScore','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard',
        'IsActiveMember','EstimatedSalary']]
states = pd.get_dummies(df['Geography'],drop_first = True)
gender = pd.get_dummies(df['Gender'],drop_first = True)

df = pd.concat([df,gender,states], axis = 1)
df.head()
X = df[['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember',
        'EstimatedSalary','Male','Germany','Spain']]

y = df['Exited']

from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train
import keras
from keras.models import Sequential
from keras.layers import Dense
classifier = Sequential()
classifier.add(Dense(activation = "relu",input_dim = 11,units = 6,kernel_initializer =
"uniform"))
classifier.add(Dense(activation = "relu",units = 6,kernel_initializer = "uniform"))
classifier.add(Dense(activation = "sigmoid",units = 1,kernel_initializer = "uniform"))
classifier.compile(optimizer="adam",loss = 'binary_crossentropy',metrics = ['accuracy'])
classifier.summary()
classifier.fit(X_train,y_train,batch_size=10,epochs=50) #Fitting the ANN to training dataset
y_pred =classifier.predict(X_test)
y_pred = (y_pred > 0.5) #Convert the sigmoid probabilities to class labels using a 0.5 threshold
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
cm = confusion_matrix(y_test,y_pred)
cm
accuracy = accuracy_score(y_test,y_pred)
accuracy
plt.figure(figsize = (10,7))
sns.heatmap(cm,annot = True)
plt.xlabel('Predicted')
plt.ylabel('Truth')
print(classification_report(y_test,y_pred))
Output:
Code:

cur_x = 2 # The algorithm starts at x=2


rate = 0.01 # Learning rate
precision = 0.000001 #This tells us when to stop the algorithm
previous_step_size = 1 #Initialise larger than precision so the loop runs at least once
max_iters = 10000 # maximum number of iterations
iters = 0 #iteration counter
df = lambda x: 2*(x+3) #Gradient of our function f(x) = (x+3)**2, whose minimum is at x = -3
while previous_step_size > precision and iters < max_iters:
    prev_x = cur_x #Store current x value in prev_x
    cur_x = cur_x - rate * df(prev_x) #Gradient descent step
    previous_step_size = abs(cur_x - prev_x) #Change in x
    iters = iters+1 #Iteration count
    print("Iteration",iters,"\nX value is",cur_x) #Print iterations

print("The local minimum occurs at", cur_x)


Output:
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

df=pd.read_csv('diabetes.csv')
df.columns
df.isnull().sum()

X = df.drop('Outcome',axis = 1)
y = df['Outcome']

from sklearn.preprocessing import scale


X = scale(X)
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Confusion matrix: ")


cs = metrics.confusion_matrix(y_test,y_pred)
print(cs)

print("Acccuracy ",metrics.accuracy_score(y_test,y_pred))

total_misclassified = cs[0,1] + cs[1,0]


print(total_misclassified)
total_examples = cs[0,0]+cs[0,1]+cs[1,0]+cs[1,1]
print(total_examples)
print("Error rate",total_misclassified/total_examples)
print("Error rate ",1-metrics.accuracy_score(y_test,y_pred))
print("Precision score",metrics.precision_score(y_test,y_pred))
print("Recall score ",metrics.recall_score(y_test,y_pred))
print("Classification report ",metrics.classification_report(y_test,y_pred))
Output:
Code:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#Importing the required libraries.
from sklearn.cluster import KMeans, k_means #For clustering
from sklearn.decomposition import PCA #Linear Dimensionality reduction.
df = pd.read_csv("sales_data_sample.csv",encoding= 'unicode_escape') #Loading the dataset.
print(df)
df_drop = ['ADDRESSLINE1', 'ADDRESSLINE2', 'STATUS', 'POSTALCODE', 'CITY',
           'TERRITORY', 'PHONE', 'STATE', 'CONTACTFIRSTNAME', 'CONTACTLASTNAME',
           'CUSTOMERNAME', 'ORDERNUMBER']
df = df.drop(df_drop, axis=1) #Drop the unnecessary categorical columns, along with columns that have too many null values to fill sensibly.
print(df)
# Checking the categorical columns.
df['COUNTRY'].unique()
df['PRODUCTLINE'].unique()
df['DEALSIZE'].unique()
productline = pd.get_dummies(df['PRODUCTLINE']) #Converting the categorical columns.
Dealsize = pd.get_dummies(df['DEALSIZE'])
df = pd.concat([df,productline,Dealsize], axis = 1)
df_drop = ['COUNTRY','PRODUCTLINE','DEALSIZE'] #Dropping COUNTRY too as there are a lot of countries.
df = df.drop(df_drop, axis=1)
df['PRODUCTCODE'] = pd.Categorical(df['PRODUCTCODE']).codes #Converting the datatype to numeric category codes.
df.drop('ORDERDATE', axis=1, inplace=True) #Dropping ORDERDATE as the month is already included.
df.dtypes #All the datatypes are converted into numeric
distortions = [] # Within Cluster Sum of Squares from the centroid

#Plotting the Elbow Plot to determine the number of clusters

K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(df)
    distortions.append(kmeanModel.inertia_) #Appending the inertia to the distortions list
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

#As the number of clusters k increases, the inertia decreases.

#Observation: an elbow can be seen at k = 3, after which the curve decreases only gradually.
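
#Optional, hedged cross-check on the elbow reading using silhouette scores
#(silhouette_score is from sklearn.metrics; higher is better, names reused from above).
from sklearn.metrics import silhouette_score
for k in range(2, 6):
    labels = KMeans(n_clusters=k, random_state=2).fit_predict(df)
    print(k, silhouette_score(df, labels))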

X_train = df.values #Returns a numpy array.


X_train.shape
model = KMeans(n_clusters=3,random_state=2) #Number of cluster = 3
model = model.fit(X_train) #Fitting the values to create a model.
predictions = model.predict(X_train) #Predicting the cluster values (0,1,or 2)
unique,counts = np.unique(predictions,return_counts=True)
counts = counts.reshape(1,3)
counts_df = pd.DataFrame(counts,columns=['Cluster1','Cluster2','Cluster3'])
print(counts_df)

pca = PCA(n_components=2) #Reducing all the features to 2 components with Principal Component Analysis to make them easy to visualize.
reduced_X = pd.DataFrame(pca.fit_transform(X_train),columns=['PCA1','PCA2']) #Creating a DataFrame of the reduced features.
print(reduced_X)

#Plotting the normal Scatter Plot


plt.figure(figsize=(14,10))
plt.scatter(reduced_X['PCA1'],reduced_X['PCA2'])
plt.show()

model.cluster_centers_ #The 3 centroids; each array holds one centroid's coordinates across all features.

reduced_centers = pca.transform(model.cluster_centers_) #Projecting the 3 centroids into the same 2D PCA space (x and y coordinates).
reduced_centers

plt.figure(figsize=(14,10))
plt.scatter(reduced_X['PCA1'],reduced_X['PCA2'])
plt.scatter(reduced_centers[:,0],reduced_centers[:,1],color='black',marker='x',s=300) #Plotting the centroids
plt.show()

reduced_X['Clusters'] = predictions #Adding the Clusters to the reduced dataframe.


reduced_X.head()

#Plotting the clusters


plt.figure(figsize=(14,10))
#For each cluster number, take that cluster's PCA1 and PCA2 columns and assign it a colour.
plt.scatter(reduced_X[reduced_X['Clusters'] == 0].loc[:,'PCA1'], reduced_X[reduced_X['Clusters'] == 0].loc[:,'PCA2'], color='slateblue')
plt.scatter(reduced_X[reduced_X['Clusters'] == 1].loc[:,'PCA1'], reduced_X[reduced_X['Clusters'] == 1].loc[:,'PCA2'], color='springgreen')
plt.scatter(reduced_X[reduced_X['Clusters'] == 2].loc[:,'PCA1'], reduced_X[reduced_X['Clusters'] == 2].loc[:,'PCA2'], color='indigo')

plt.scatter(reduced_centers[:,0],reduced_centers[:,1],color='black',marker='x',s=300)

plt.show()
Output:
