CSE Machine Learning Lab Manual
ST. ANDREWS INSTITUTE OF TECHNOLOGY & MANAGEMENT
Bachelor of Technology
A Practical File
Subject Code: LC-CSE-412G
ROLL NO.:
St. Andrews Institute of Technology & Management, Gurugram
Department of……………………………
Average Marks
(Faculty Sign.)
PROGRAM NO: 1
➢ The probability that it is Friday and that a student is absent is 3%. Since there are 5 school days in a week, the probability that it is Friday is 20%. What is the probability that a student is absent given that today is Friday? Apply Bayes' rule in Python to get the result.
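Worked out by hand before coding, Bayes' rule gives:
P(Absent | Friday) = P(Friday and Absent) / P(Friday) = 0.03 / 0.20 = 0.15
so the required probability is 15%.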
SOURCE CODE:
probAbsentFriday = 0.03   # P(Friday and Absent)
probFriday = 0.2          # P(Friday)
# Bayes' rule: P(Absent|Friday) = P(Friday and Absent) / P(Friday)
bayesResult = probAbsentFriday / probFriday
print(bayesResult * 100)
OUTPUT:
15.0
PROGRAM NO: 2
➢ Implement and demonstrate the FIND-S algorithm for finding the most specific
hypothesis based on a given set of training data samples. Read the training data
from a .CSV file.
FIND-S Algorithm:
1. Initialize h to the most specific hypothesis in H.
2. For each positive training instance x:
   For each attribute constraint a_i in h:
      if the constraint a_i is satisfied by x, do nothing;
      else replace a_i in h by the next more general constraint that is satisfied by x.
3. Output hypothesis h.
Training Examples:
SOURCE CODE:
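The source listing did not survive in this copy of the manual. Below is a minimal sketch of FIND-S, assuming the training data sits in a file named trainingdata.csv (the file name is borrowed from Program 3 and is an assumption here), with the class label in the last column and positive examples labelled "Yes":

import csv

with open("trainingdata.csv") as f:
    data = list(csv.reader(f))

h = None  # h starts as the most specific hypothesis (taken from the first positive example)
for row in data:
    if row[-1] == "Yes":          # FIND-S looks only at positive examples
        attrs = row[:-1]
        if h is None:
            h = attrs[:]
        else:
            # Generalize h wherever it disagrees with this positive example
            for j in range(len(h)):
                if h[j] != attrs[j]:
                    h[j] = '?'

print("Most specific hypothesis:", h)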
OUTPUT:
PROGRAM NO: 3
➢ For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate Elimination algorithm to output a description of the set
of all hypotheses consistent with the training examples.
SOURCE CODE:
import csv

with open("trainingdata.csv") as f:
    csv_file = csv.reader(f)
    data = list(csv_file)

# Initialize S to the first training example and G to the most general boundary
s = data[1][:-1]
g = [['?' for i in range(len(s))] for j in range(len(s))]

for i in data:
    if i[-1] == "Yes":                  # positive example: generalize S
        for j in range(len(s)):
            if i[j] != s[j]:
                s[j] = '?'
                g[j][j] = '?'
    elif i[-1] == "No":                 # negative example: specialize G
        for j in range(len(s)):
            if i[j] != s[j]:
                g[j][j] = s[j]
            else:
                g[j][j] = "?"
    print("\nSteps of Candidate Elimination Algorithm", data.index(i) + 1)
    print(s)
    print(g)

# Keep only the general hypotheses that were actually specialized
gh = []
for i in g:
    for j in i:
        if j != '?':
            gh.append(i)
            break

print("\nFinal specific hypothesis:\n", s)
print("\nFinal general hypothesis:\n", gh)
OUTPUT:
PROGRAM NO: 4
➢ Write a program to demonstrate the working of the decision tree-based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.
SOURCE CODE:
import pandas as pd
import math
import numpy as np

data = pd.read_csv("3-dataset.csv")
features = [feat for feat in data]
features.remove("answer")

class Node:
    def __init__(self):
        self.children = []
        self.value = ""
        self.isLeaf = False
        self.pred = ""

def entropy(examples):
    # Entropy of the class label over the given examples
    pos = 0.0
    neg = 0.0
    for _, row in examples.iterrows():
        if row["answer"] == "yes":
            pos += 1
        else:
            neg += 1
    if pos == 0.0 or neg == 0.0:
        return 0.0
    else:
        p = pos / (pos + neg)
        n = neg / (pos + neg)
        return -(p * math.log(p, 2) + n * math.log(n, 2))

def info_gain(examples, attr):
    # Information gain from splitting the examples on attr
    uniq = np.unique(examples[attr])
    gain = entropy(examples)
    for u in uniq:
        subdata = examples[examples[attr] == u]
        sub_e = entropy(subdata)
        gain -= (float(len(subdata)) / float(len(examples))) * sub_e
    return gain

def ID3(examples, attrs):
    root = Node()
    # Pick the attribute with the highest information gain
    max_gain = 0
    max_feat = ""
    for feature in attrs:
        gain = info_gain(examples, feature)
        if gain > max_gain:
            max_gain = gain
            max_feat = feature
    root.value = max_feat
    uniq = np.unique(examples[max_feat])
    for u in uniq:
        subdata = examples[examples[max_feat] == u]
        if entropy(subdata) == 0.0:
            # Pure subset: make a leaf holding the remaining class value
            newNode = Node()
            newNode.isLeaf = True
            newNode.value = u
            newNode.pred = np.unique(subdata["answer"])
            root.children.append(newNode)
        else:
            # Impure subset: recurse on the remaining attributes
            dummyNode = Node()
            dummyNode.value = u
            new_attrs = attrs.copy()
            new_attrs.remove(max_feat)
            child = ID3(subdata, new_attrs)
            dummyNode.children.append(child)
            root.children.append(dummyNode)
    return root

def printTree(root: Node, depth=0):
    for i in range(depth):
        print("\t", end="")
    print(root.value, end="")
    if root.isLeaf:
        print(" -> ", root.pred)
    print()
    for child in root.children:
        printTree(child, depth + 1)

root = ID3(data, features)
printTree(root)
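The assignment also asks that the learned tree be used to classify a new sample, which the listing above does not do. A minimal sketch of such a walk follows; the helper name classify and the dict-of-attributes sample format are our assumptions, and the attribute names must match the columns of 3-dataset.csv:

def classify(root, sample):
    # Leaf: return the stored prediction
    if root.isLeaf:
        return root.pred
    # Follow the child branch whose value matches the tested attribute
    for child in root.children:
        if child.value == sample[root.value]:
            if child.isLeaf:
                return child.pred
            # Branch nodes hold the subtree as their only child
            return classify(child.children[0], sample)
    return None  # attribute value never seen in training

# Example call (attribute names are hypothetical):
# print(classify(root, {"outlook": "sunny", "temperature": "cool",
#                       "humidity": "normal", "wind": "strong"}))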
OUTPUT:
PROGRAM NO: 5
➢ Assuming a set of documents that need to be classified, use the naïve Bayesian classifier model to perform this task. Built-in Java classes/API can be used to write the program. Calculate the accuracy, precision, and recall for your data set.
SOURCE CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

msg = pd.read_csv('naivetext.csv', names=['message', 'label'])
print('The dimensions of the dataset', msg.shape)
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum

# splitting the dataset into train and test data
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print('\n the total number of Training Data :', ytrain.shape)
print('\n the total number of Test Data :', ytest.shape)

# output the words or tokens in the text documents
cv = CountVectorizer()
xtrain_dtm = cv.fit_transform(xtrain)
xtest_dtm = cv.transform(xtest)
print('\n The words or Tokens in the text documents \n')
print(cv.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
df = pd.DataFrame(xtrain_dtm.toarray(), columns=cv.get_feature_names_out())

# Training the Naive Bayes (NB) classifier on training data
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

# printing accuracy, confusion matrix, precision and recall
print('\n Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('\n The value of Precision', metrics.precision_score(ytest, predicted))
print('\n The value of Recall', metrics.recall_score(ytest, predicted))
OUTPUT:
PROGRAM NO: 6
➢ Write a program to implement the k-Nearest Neighbour (KNN) algorithm to classify the Iris dataset.
The model representation for KNN is the entire training dataset. It is as simple as that. KNN has no model other than storing the entire dataset, so there is no learning required. Efficient implementations can store the data using complex data structures like k-d trees to make look-up and matching of new patterns during prediction efficient. Because the entire training dataset is stored, you may want to think carefully about the consistency of your training data. It might be a good idea to curate it, update it often as new data becomes available, and remove erroneous and outlier data.
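To make the k-d tree remark concrete, here is a minimal sketch (our illustration, not part of the manual's listing) using scikit-learn's KDTree: the tree is built once over the stored training data and then answers nearest-neighbour queries without scanning every point.

from sklearn.datasets import load_iris
from sklearn.neighbors import KDTree

iris = load_iris()
tree = KDTree(iris.data)                    # build the k-d tree over all stored samples
dist, ind = tree.query(iris.data[:1], k=3)  # 3 nearest neighbours of the first sample
print(ind)   # indices of the nearest stored samples
print(dist)  # their distances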
SOURCE CODE:
# Import necessary modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Loading data
irisData = load_iris()

# Create feature and target arrays
X = irisData.data
y = irisData.target

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 7-nearest-neighbour classifier and report test accuracy
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
OUTPUT:
PROGRAM NO: 7
➢ Given the following data, which specify classifications for nine combinations of VAR1 and VAR2, predict a classification for a case where VAR1 = 0.906 and VAR2 = 0.606, using the result of k-means clustering with 3 means (i.e., 3 centroids).
SOURCE CODE:
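Only the final prediction call survived in this copy of the manual, and the nine-row data table is missing. Below is a minimal sketch of the assumed setup: the nine (VAR1, VAR2) pairs from the table would go into X (random placeholder values are used here only so the sketch runs; substitute the actual table), and the fitted estimator must be named kmeans for the original call that follows to work.

import numpy as np
from sklearn.cluster import KMeans

# Placeholder for the nine (VAR1, VAR2) pairs from the exercise's table
X = np.random.RandomState(0).rand(9, 2)

# k-means with 3 means, i.e. 3 centroids
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)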
kmeans.predict([[0.906, 0.606]])
PROGRAM NO: 8
➢ Write a program to implement simple linear regression on a given dataset and plot the regression line.
SOURCE CODE:
import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)
    # mean of x and y vector
    m_x, m_y = np.mean(x), np.mean(y)
    # calculating cross-deviation and deviation about x
    SS_xy = np.sum(y * x) - n * m_y * m_x
    SS_xx = np.sum(x * x) - n * m_x * m_x
    # calculating regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1 * m_x
    return (b_0, b_1)

def plot_regression_line(x, y, b):
    # plotting the actual points as scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)
    # predicted response vector
    y_pred = b[0] + b[1] * x
    # plotting the regression line
    plt.plot(x, y_pred, color="g")
    # putting labels
    plt.xlabel('x')
    plt.ylabel('y')
    # function to show plot
    plt.show()

def main():
    # observations
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
    # estimating coefficients
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))
    # plotting regression line
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()
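As a quick sanity check (our addition, not part of the manual), the same coefficients can be recovered with NumPy's built-in least-squares fit:

import numpy as np

x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
b_1, b_0 = np.polyfit(x, y, 1)  # degree-1 fit returns (slope, intercept)
print("b_0 =", b_0, "\nb_1 =", b_1)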
OUTPUT:
Estimated coefficients:
b_0 = 1.2363636363636363
b_1 = 1.1696969696969697
PROGRAM NO: 9
➢ Write a program to classify a set of documents using the naïve Bayesian classifier and calculate the accuracy, precision, and recall.
SOURCE CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

msg = pd.read_csv('document.csv', names=['message', 'label'])
print("Total Instances of Dataset: ", msg.shape[0])
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

count_v = CountVectorizer()
Xtrain_dm = count_v.fit_transform(Xtrain)
Xtest_dm = count_v.transform(Xtest)
df = pd.DataFrame(Xtrain_dm.toarray(), columns=count_v.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2

clf = MultinomialNB()
clf.fit(Xtrain_dm, ytrain)
pred = clf.predict(Xtest_dm)

print('Accuracy Metrics:')
print('Accuracy: ', accuracy_score(ytest, pred))
print('Recall: ', recall_score(ytest, pred))
print('Precision: ', precision_score(ytest, pred))
print('Confusion Matrix: \n', confusion_matrix(ytest, pred))
OUTPUT:
PROGRAM NO: 10
SOURCE CODE:
# Making predictions
predictions = model.predict(X_test)
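Only the prediction step survived in this copy. Below is a fuller sketch of what such a program could look like, assuming a generic CSV dataset, a train/test split, and a logistic-regression classifier; the file name dataset.csv, the column name target, and the model choice are all our placeholders, not the manual's. It would print metrics in the format of the output shown below.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Hypothetical dataset: substitute the manual's actual data file and label column
data = pd.read_csv('dataset.csv')
X = data.drop('target', axis=1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Making predictions
predictions = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, predictions))
print('Classification Report:')
print(classification_report(y_test, predictions))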
OUTPUT:
Accuracy: 0.8524590163934426
Classification Report:
              precision    recall  f1-score   support

    accuracy                           0.85        61
   macro avg       0.85      0.84      0.84        61
weighted avg       0.85      0.85      0.85        61