Sree Narayana Gurukulam: College of Engineering
Sree Narayana Gurukulam: College of Engineering
Sree Narayana Gurukulam: College of Engineering
COLLEGE OF ENGINEERING
KADAYIRUPPU, KOLENCHERY 682311
LABORATORY RECORD
YEAR---------TO
NAME---------------------------------------------------------------------------
SEMESTER-----------------------------------------ROLL NO.------------------------
BRANCH----------------------------------------
Kadayiruppu
Date:
Program : 1
Aim: To review fundamentals of Python programming for Data science and machine learning programming.
Here, we focus on concepts such as
Program
# Demonstrate Python's three built-in numeric types by printing the type
# of an int, a float and a complex literal.
a = 5
print("Type of a: ", type(a))

b = 5.0
print("\nType of b: ", type(b))

c = 2 + 4j
print("\nType of c: ", type(c))
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 3/10
3/7/22, 7:51 PM Fundamentals.ipynb - Colaboratory
# with double Quotes
12 String1 = "I'm happy to program"
13 print("\nString with the use of Double Quotes: ") 14
print(String1)
15 print(type(String1))
16
17 # Creating a String 18
# with triple Quotes
19 String1 = '''I'm happy to learn "Python"'''
20 print("\nString with the use of Triple Quotes: ") 21
print(String1)
22 print(type(String1))
23
24 # Creating String with triple 25
# Quotes allows multiple lines 26
String1 = '''Programming
27 is
28 fun'''
29 print("\nCreating a multiline String: ") 30
print(String1)
31
32
33 # Python Program to Access 34 #
characters of String
35
36 String1 = "Programming" 37
print("Initial String: ") 38
print(String1)
39
40 # Printing First character
41 print("\nFirst character of String is: ") 42
print(String1[0])
43
44 # Printing Last character
45 print("\nLast character of String is: ") 46
print(String1[-1])
47
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 4/10
3/7/22, 7:51 PM Fundamentals.ipynb - Colaboratory
Multi-Dimensional List:
[['Python', 'Java'], ['C']]
13 print(Tuple1)
14
15 # Creating a Tuple with
16 # the use of list
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 5/10
3/7/22, 7:51 PM Fundamentals.ipynb - Colaboratory
17 list1 = [1, 2, 4, 5, 6]
18 print("\nTuple using List: ")
19 print(tuple(list1))
20
21 # Creating a Tuple with the
22 # use of built-in function
23 Tuple1 = tuple('Python')
24 print("\nTuple with the use of function: ")
25 print(Tuple1)
26
27 # Creating a Tuple
28 # with nested tuples
29 Tuple1 = (0, 1, 2, 3)
30 Tuple2 = ('data', 'science')
31 Tuple3 = (Tuple1, Tuple2)
32 print("\nTuple with nested tuples: ")
33 print(Tuple3)
34
35
36 # Python program to
37 # demonstrate accessing tuple
38
39 tuple1 = tuple([1, 2, 3, 4, 5])
40
41 # Accessing element using indexing
42 print("First element of tuple")
43 print(tuple1[0])
44
45 # Accessing element from last
46 # negative indexing
47 print("\nLast element of tuple")
48 print(tuple1[-1])
49
50 print("\nThird last element of tuple")
51 print(tuple1[-3])
52
1 # Python program to
2 # demonstrate boolean type
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 6/10
3/7/22, 7:51 PM Fundamentals.ipynb - Colaboratory
3
4 print(type(True))
5 print(type(False))
6
7 print(type(true)) #Error for case sensitive t in True 8
<class 'bool'>
<class 'bool'>
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 7/10
3/7/22, 7:51 PM Fundamentals.ipynb - Colaboratory
42 # Checking the element 43
# using in keyword
44 print("Java" in set1)
45
46
Initial set
{'Java', 'Python', 'Program'}
Elements of set:
Java Python Program True
34 # method
35 print("Accessing a element using get:") 36
print(Dict.get(3))
37
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 8/10
3/7/22, 7:51 PM Fundamentals.ipynb - Colaboratory
Empty Dictionary:
{}
# Function basics. The name ``my_function`` is deliberately re-bound by each
# demo below, so only the most recent definition is callable at any point.


def my_function():
    """Print a fixed greeting; takes no arguments."""
    print("Hello from a simple function")


my_function()


def my_function(fname):
    """Print a greeting that embeds the single positional argument."""
    print("Hello " + fname + " from a fuction with one argument")


my_function("Amy")
my_function("Sara")
my_function("Thomas")


# function with 2 arguments
def my_function(fname, lname):
    """Print the two arguments separated by a single space."""
    print(fname + " " + lname)


my_function("Amy", "Joseph")


# function with arbitrary number of arguments
# If the number of arguments is unknown, add a * before the parameter name:
def my_function(*kids):
    """Print the third positional argument.

    Raises IndexError when fewer than three arguments are supplied,
    because ``kids[2]`` is indexed unconditionally.
    """
    print("The youngest child is " + kids[2])


my_function("Amy", "Sara", "Stephen")


# Arguments with the key = value syntax.
def my_function(child3, child2, child1):
    """Print ``child3``; callers may pass the arguments by keyword in any order."""
    print("The youngest child is " + child3)


my_function(child1="Amy", child2="Sara", child3="Stephen")


# Keyword Arguments
# If the number of keyword arguments is unknown, add a double ** before the parameter name
def my_function(**kid):
    """Print the value passed under the ``lname`` keyword.

    Raises KeyError when no ``lname`` keyword argument is given.
    """
    print("His last name is " + kid["lname"])


my_function(fname="Sara", lname="Peter")
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 9/10
3/7/22, 7:51 PM Fundamentals.ipynb - Colaboratory
# Default parameter value
def my_function(country="Norway"):
    """Print the country of origin; ``country`` falls back to "Norway"."""
    print("I am from " + country)


my_function("Sweden")
my_function("India")
my_function()
my_function("Brazil")


# List as argument
def my_function(food):
    """Print every item of the iterable ``food`` on its own line."""
    for x in food:
        print(x)


fruits = ["apple", "banana", "cherry"]
my_function(fruits)


# return statement
def my_function(x):
    """Return five times ``x``."""
    return 5 * x


print(my_function(3))
print(my_function(5))
print(my_function(9))


# Pass statement
# function definitions cannot be empty, but if you for some reason have a function
# definition with no content, put in the pass statement to avoid getting an error
def myfunction():
    pass


# recursion Example
def tri_recursion(k):
    """Return k + (k - 1) + ... + 1, printing each partial sum on the way up.

    For ``k <= 0`` the result is 0 and nothing is printed.
    """
    if k > 0:
        result = k + tri_recursion(k - 1)
        print(result)
    else:
        result = 0
    return result


print("\n\nRecursion Example Results")
tri_recursion(6)


# Lambda function
# A lambda function is a small anonymous function.
# A lambda function can take any number of arguments, but can only have one expression.

# Example 1: Multiply argument a with argument b and return the result:
x = lambda a, b: a * b
print(x(5, 6))

# Function within another function
# The power of lambda is better shown when you use them as an anonymous function
# inside another function.

print("\n\n\n Function with function")
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 10/1
3/7/22, 7:51 PM Fundamentals.ipynb - Colaboratory
def myfunc(n):
    """Return a one-argument function that multiplies its input by ``n``."""
    return lambda a: a * n


# Two multipliers built from the same factory, each with its own ``n``.
mydoubler = myfunc(2)
mytripler = myfunc(3)

print("My doubler: %d" % mydoubler(11))
print("My Tripler: %d" % mytripler(11))
109
https://colab.research.google.com/drive/1FOlIXPiv93y86LuNu-zYtJZmU8BsNHlx#printMode=true 11/1
3/7/22, 7:59 PM SREEJITH T SHAJI CO1-DataVisualisation-
Hunting Exoplanets In Space - Scatter & Line
Program : 2
Aim : Program to handle data using pandas & perform data visualization using matplotlib & seaborn.
LABEL FLUX.1 FLUX.2 FLUX.3 FLUX.4 FLUX.5 FLUX.6 FLUX.7 FLUX.8 FLUX.9 FLUX.10 FLUX
0 2 93.85 83.81 20.10 -26.98 -39.56 -124.71 -135.18 -96.27 -79.89 -160.17 -20
1 2 -38.88 -33.83 -58.54 -40.09 -79.31 -72.81 -86.55 -85.33 -83.97 -73.38 -8
2 2 532.64 535.92 513.73 496.92 456.45 466.00 464.50 486.39 436.56 484.39 46
3 2 326.52 347.39 302.35 298.13 317.74 312.70 322.33 311.31 312.42 323.33 31
4 2 -1107.21 -1112.59 -1118.95 -1095.10 -1057.55 -1034.48 -998.34 -1022.71 -989.57 -970.88 -93
(2828, 3198)
LABEL 0
FLUX.1 0
FLUX.2 0
FLUX.3 0
FLUX.4 0
..
FLUX.3193 1
FLUX.3194 1
FLUX.3195 1
FLUX.3196 1
FLUX.3197 1
Length: 3198, dtype: int64
https://colab.research.google.com/drive/1eyoUgfgyvWBAugjRzvtlcMywvQ5CZYrp?usp=sharing 1
3/7/22, 7:59 PM SREEJITH T SHAJI CO1-DataVisualisation-
Hunting Exoplanets In Space - Scatter & Line
Syntax: dataframe.iloc[row_position_start : row_position_end, column_position_start : column_position_end]
In this syntax:
row_position_start denotes the position of the row in the DataFrame starting from whose values you want to take in the new Pandas series or
DataFrame.
row_position_end denotes the position of the row in the DataFrame till whose values you want to take in the new Pandas series or DataFrame.
column_position_start denotes the position of the column in the DataFrame starting from whose values you want to take in the new
Pandas series or DataFrame.
column_position_end denotes the position of the column in the DataFrame till whose values you want to take in the new Pandas series or
DataFrame.
You can verify manually whether we have extracted the values from the first row or not by viewing the first 5 rows of the DataFrame using the head()
function.
1 # Create a Pandas series for the first star and store it in a variable called 'star_0'. 2 star_0 =
exo_train_df.iloc[0, :]
3 star_0.head()
LABEL 2.00
FLUX.1 93.85
FLUX.2 83.81
FLUX.3 20.10
FLUX.4 -26.98
Name: 0, dtype: float64
1 type(star_0)
pandas.core.series.Series
1 # Create a Pandas series for the second star and store it in a variable called 'star_1'. 2 star_1=
exo_train_df.iloc[1, :]
3 star_1.head()
LABEL 2.00
FLUX.1 -38.88
FLUX.2 -33.83
FLUX.3 -58.54
FLUX.4 -40.09
Name: 1, dtype: float64
1 # Create a Pandas series for the third star and store it in a variable called 'star_2'. 2 star_2=
exo_train_df.iloc[2, :]
3 star_2.head()
LABEL 2.00
FLUX.1 532.64
FLUX.2 535.92
FLUX.3 513.73
FLUX.4 496.92
Name: 2, dtype: float64
1 # Create a Pandas series for the last star and store it in a variable called 'star_5086'. 2
star_5086= exo_train_df.iloc[-1, :]
3 star_5086.head()
LABEL 1.00
FLUX.1 -63.94
FLUX.2 -78.34
FLUX.3 -87.04
FLUX.4 -58.34
Name: 2827, dtype: float64
https://colab.research.google.com/drive/1eyoUgfgyvWBAugjRzvtlcMywvQ5CZYrp?usp=sharing 1
3/7/22, 7:59 PM SREEJITH T SHAJI CO1-DataVisualisation-
Hunting Exoplanets In Space - Scatter & Line
Now plot the Flux values on the y − axis for each observation for a star. On x − axis, we will plot numbers ranging from 1 to 3197 .
2. Then we need to call the figure() function from the plt module to resize the plot. The figure() function takes figsize=
(horizontal_width, vertical_height) parameter as an input.
plt.figure(figsize=(16, 4))
3. Then we need either a Python list, a NumPy array or a Pandas series containing the numbers between 1 and 3197 to plot them on the
x − axis.
x_values_star_0 = np.arange(1, 3198)
4. Then we need star_0 Pandas series to plot the FLUX values on the y − axis for the first star in the DataFrame.
y_values_star_0 = star_0[1:]
5. Then we need to call the scatter() function from the plt module with the required inputs as described in the third and the fourth steps.
plt.scatter(x_values_star_0, y_values_star_0)
6. Finally, we need to call the show() function from the plt module.
plt.show()
25 plt.show()
26 # The 'show()' function displays the plot.
https://colab.research.google.com/drive/1eyoUgfgyvWBAugjRzvtlcMywvQ5CZYrp?usp=sharing 1
3/7/22, 7:59 PM SREEJITH T SHAJI CO1-DataVisualisation-
Hunting Exoplanets In Space - Scatter & Line
[<matplotlib.lines.Line2D at 0x7f53f0d18f10>]
The line plot also confirms the periodic downward-peaks in the FLUX values.
<matplotlib.collections.PathCollection at 0x7f53f026a150>
It is quite difficult to spot any clear pattern in the scatter plot for the second star in the DataFrame. Let's draw a line plot to identify a pattern.
5 plt.plot(x_star1,y_star1)
6
7
https://colab.research.google.com/drive/1eyoUgfgyvWBAugjRzvtlcMywvQ5CZYrp?usp=sharing 1
3/7/22, 7:59 PM SREEJITH T SHAJI CO1-DataVisualisation-
[<matplotlib.lines.Line2D at 0x7f53f01f6350>] Hunting Exoplanets In Space - Scatter & Line
As we can see, there are consistent sudden drops in the brightness levels for the second star in the DataFrame. This suggests that the planet is
orbiting its star at very high radial speed. Also, the planet could be very close to the star.
<matplotlib.collections.PathCollection at 0x7f53f011bad0>
Here also, we can spot clear, repetitive downward-peaks, which confirm that the star has at least one planet.
[<matplotlib.lines.Line2D at 0x7f53f0095650>]
https://colab.research.google.com/drive/1eyoUgfgyvWBAugjRzvtlcMywvQ5CZYrp?usp=sharing 1
3/7/22, 7:59 PM SREEJITH T SHAJI CO1-DataVisualisation-
Hunting Exoplanets In Space - Scatter & Line
1 # Create a scatter plot for the second-last star, i.e., 'star_5085' in the DataFrame. 2
star_5085= exo_train_df.iloc[-2, :]
3 plt.figure(figsize=(16,4))
4 x_star5085 = np.arange(1,3198) 5
y_star5085 = star_5085[1:]
6 plt.scatter(x_star5085,y_star5085)
7
<matplotlib.collections.PathCollection at 0x7f53f1b1bcd0>
There is no clear periodic downward-peak pattern in the FLUX values for the second-last star.
1 # Student Action: Create a line plot for the second-last star in the DataFrame. 2
plt.figure(figsize=(16,4))
3 x_star5085 = np.arange(1,3198) 4
y_star5085 = star_5085[1:]
5 plt.plot(x_star5085,y_star5085)
[<matplotlib.lines.Line2D at 0x7f53f1755c90>]
The line-plot also confirms that there is no clear periodic downward-peak pattern in the FLUX values.
https://colab.research.google.com/drive/1eyoUgfgyvWBAugjRzvtlcMywvQ5CZYrp?usp=sharing 1
CO2 KNN Classification SREEJITH T SHAJI
3/7/22, 8:01 PM
Program : 3
Aim: Program to implement k-NN classification using any standard dataset available in the public domain and
find the accuracy of the algorithm
Algorithm:
1. The distance between the unknown instance and all other training instances is computed.
2. The k nearest neighbors are identified.
3. The class labels of the k nearest neighbors are used to determine the class label of the unknown instance
by using techniques like majority voting.
accuracy 0.97 30
macro avg 0.97 0.97 0.97 30
weighted avg 0.97 0.97 0.97 30
https://colab.research.google.com/drive/1ApbBuLjwZsMcUOKwhBmcTpKBwwCAItKT?usp=sharing 2/
3/7/22, 8:03 PM CO2 Simple linear regression SREEJITH T SHAJI
422Colaboratory
Program : 4
Aim: Program to implement Naïve Bayes Algorithm using any standard dataset available in the public
domain and find the accuracy of the algorithm
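Bayes’ Theorem provides a way that we can calculate the probability of a piece of data belonging to a given class,
given our prior knowledge. Bayes’ Theorem is stated as:
$$P(\text{class} \mid \text{data}) = \frac{P(\text{data} \mid \text{class}) \, P(\text{class})}{P(\text{data})}$$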
We are using Iris Dataset. The Iris Flower Dataset involves predicting the flower species given
measurements of iris flowers.
It is a multiclass classification problem. The number of observations for each class is balanced. There are 150
observations with 4 input variables and 1 output variable. The variable names are as follows:
Sepal length in cm, Sepal width in cm, Petal length in cm, Petal width in cm, Class.
Algorithm:
1 #Import Modules
2 import numpy as np
3 import matplotlib.pyplot as plt
4 from sklearn import neighbors, datasets, preprocessing 5
from sklearn.model_selection import train_test_split 6 from
sklearn.neighbors import KNeighborsClassifier
7 from sklearn.metrics import classification_report,confusion_matrix,accuracy_score 8
9
10 #Load iris dataset & do train_test_split 11
iris=datasets.load_iris()
12 x,y=iris.data[:,:],iris.target
13 x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,random_state=20,train_size=.8)
14
https://colab.research.google.com/drive/16QqZPmMNfL3qz3zNmM_2SBKtZ9o60g06?usp=sharing
3/7/22, 8:03 PM CO2 Simple linear regression SREEJITH T SHAJI
422Colaboratory
1 #Feature Scaling
2 scaler=preprocessing.StandardScaler().fit(x_train)
3 x_train=scaler.transform(x_train)
4 x_test=scaler.transform(x_test)
5 x_train
https://colab.research.google.com/drive/16QqZPmMNfL3qz3zNmM_2SBKtZ9o60g06?usp=sharing
3/7/22, 8:03 PM CO2 Simple linear regression SREEJITH T SHAJI
422Colaboratory
In this step, we introduce the class GaussianNB from the sklearn.naive_bayes module. Here, we have
used a Gaussian model; there are several other models such as Bernoulli, Categorical and Multinomial. Here, we
assign an instance of the GaussianNB class to the variable darsana and fit the x_train and y_train values to it for
training purposes.
1 #Implement Naive Bayes
2 from sklearn.naive_bayes import GaussianNB 3
darsana=GaussianNB()
4 darsana.fit(x_train,y_train)
GaussianNB()
1.0
confusion matrix
[[10 0 0]
[ 0 10 0]
[ 0 0 10]]
classification report
precision recall f1-score support
accuracy 1.00 30
macro avg 1.00 1.00 1.00 30
weighted avg 1.00 1.00 1.00 30
1 #bernoulli
2 from sklearn.naive_bayes import BernoulliNB
3 darsana= BernoulliNB()
4 darsana.fit(x_train,y_train)
5 y_test_pred=darsana.predict(x_test)
6 print(accuracy_score(y_test,y_test_pred))
7 print("confusion matrix")
8 print(confusion_matrix(y_test,y_test_pred))
9 print("classification report")
10 print(classification_report(y_test,y_test_pred))
11
https://colab.research.google.com/drive/16QqZPmMNfL3qz3zNmM_2SBKtZ9o60g06?usp=sharing
3/7/22, 8:03 PM CO2 Simple linear regression SREEJITH T SHAJI
422Colaboratory
0.7
confusion matrix
[[10 0 0]
[ 2 2 6]
[ 0 1 9]]
classification report
precision recall f1-score support
accuracy 0.70 30
macro avg 0.70 0.70 0.65 30
weighted avg 0.70 0.70 0.65 30
From the above confusion matrix, we infer that, out of 30 test set data, 21 were correctly classified and 9 were
incorrectly classified. This gives the Bernoulli model an accuracy of 70%, compared with 100% (30 out of 30) for the Gaussian model above.
https://colab.research.google.com/drive/16QqZPmMNfL3qz3zNmM_2SBKtZ9o60g06?usp=sharing
3/7/22, 8:03 PM CO2 Simple linear regression SREEJITH T SHAJI
422Colaboratory
Program : 5
Aim : Program to implement simple linear regression technique using any standard dataset available in the public domain and
evaluate its performance.
Problem Statement
As an owner of a startup, you wish to forecast the sales of your product to plan how much money should be spent on advertisements. This is because
the sale of a product is usually proportional to the money spent on advertisements.
Predict the impact of TV advertising on your product sales by performing simple linear regression analysis.
List of Activities
Activity 1: Analysing the dataset
Activity 2: Train-Test split
Activity 3: Model training
Activity 4: Plotting the best fit line
Activity 5: Model prediction
Also, print the first five rows of the dataset. Check for null values and treat them accordingly.
1 # Import modules 2
import numpy as np
3 import pandas as pd
4 import matplotlib.pyplot as plt 5
import seaborn as sns
6
7 # Load the dataset
8 ad_df=pd.read_csv('https://raw.githubusercontent.com/jiss-sngce/CO_3/main/advertising.csv') 9 # Print
first five rows using head() function
10 ad_df.head()
1 # Check if there are any null values. If any column has null values, treat them accordingly 2
ad_df.isnull().sum()
TV 0
Radio 0
Newspaper 0
Sales 0
dtype: int64
https://colab.research.google.com/drive/16QqZPmMNfL3qz3zNmM_2SBKtZ9o60g06?usp=sharing
3/7/22, 8:03 PM CO2 Simple linear regression SREEJITH T SHAJI
422Colaboratory
Activity 2: Train-Test Split
For simple linear regression, consider only the effect of TV ads on sales. Thus, TV is the feature variable and Sales is the target variable.
Split the dataset into training set and test set such that the training set contains 67% of the instances and the remaining instances will become the
test set.
Train the simple regression model using training data to obtain the best fit line y = mx + c. For this, perform the following tasks:
1. Create following two functions:
A function errors_product() that calculates the errors for the feature and target variables i.e. $(x_i - \bar{x})(y_i - \bar{y})$
A function squared_errors() that calculates the squared errors for the feature variable only i.e. $(x_i - \bar{x})^2$
2. Calculate the slope and intercept values for the best fit line by applying the following formulae:
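$$\text{slope} \Rightarrow m = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sum (x_i - \bar{x})^2} = \frac{\texttt{errors\_product().sum()}}{\texttt{squared\_errors().sum()}}$$
$$\text{intercept} \Rightarrow c = \bar{y} - m\bar{x}$$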
1 # Calculate the slope and intercept values for the best fit line. 2 slope =
errors_product().sum() / squared_errors().sum()
3 slope
4 round(slope,3)
5
0.057
1 inc=y_train.mean()-slope*x_train.mean()
2 inc
3 round(inc,3)
6.767
Q: What is the equation obtained for the best fit line of this model?
A:slope(m)=0.057
intercept(c)=6.767
sales=0.057*tv+6.767
1 # Plot the regression line in the scatter plot between Sales and TV advertisment values. 2
plt.style.use('dark_background')
3 plt.figure(figsize=(16,14))
4 plt.scatter(ad_df['TV'],ad_df['Sales'])
5 plt.plot(ad_df['TV'],slope * ad_df['TV'] + inc,color='r', label='y=0.057*x+6.767') 6
plt.xlabel('Tv')
https://colab.research.google.com/drive/16QqZPmMNfL3qz3zNmM_2SBKtZ9o60g06?usp=sharing
3/7/22, 8:03 PM CO2 Simple linear regression SREEJITH T SHAJI
7 plt.ylabel('Sales') 422Colaboratory
8 plt.legend()
9 plt.show()
https://colab.research.google.com/drive/16QqZPmMNfL3qz3zNmM_2SBKtZ9o60g06?usp=sharing
3/7/22, 8:05 PM 9/12/21 SREEJITH T SHAJI simple linear CO3
# Create a function which takes TV advertisement value as an input and returns the sales.
def sales_predicted(tv_bd, slope=0.057, intercept=6.767):
    """Return the sales predicted by the fitted line for a TV advertising value.

    Evaluates ``slope * tv_bd + intercept``. The defaults are the rounded
    slope and intercept reported for the best-fit line in this notebook, so
    ``sales_predicted(tv_bd)`` keeps returning ``0.057 * tv_bd + 6.767``.

    Args:
        tv_bd: TV advertising value (the notebook passes 50 for $50,000).
        slope: Slope ``m`` of the best-fit line.
        intercept: Intercept ``c`` of the best-fit line.
    """
    return slope * tv_bd + intercept
4
5
6 # Calculating sales value against $50,000 spent in TV ads 7
bd=sales_predicted(50)
8 bd*1000
9617.0
Q: If you are planning to invest $50,000 in TV advertising, how many units of sales can be predicted according to this simple linear
regression model?
A: Approximately 9617 units
1 x_train.shape
(134,)
1 y_train.shape
(134,)
1 type(x_train)
pandas.core.series.Series
[6.76732677]
[[0.05729132]
https://colab.research.google.com/drive/1qZ9sw3UQdRtk6g_dM8KBnqP2zKDv_Sre#scrollTo=RKKT9L8nIfL_&printMode=true 8/
3/7/22, 8:08 PM SREEJITH T SHAJI MultipleLinearRegression - Colaboratory
Program : 6
Aim : Program to implement multiple linear regression technique using any standard dataset available in the public
domain and evaluate its performance.
MultipleLinearRegression
Program to implement multiple linear regression technique using any standard dataset available in the public domain and evaluate its performance.
The description for all the columns containing data for air pollutants, temperature, relative humidity and absolute humidity is provided below.
Columns Description
PT08.S1(CO) PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)
C6H6(GT) True hourly averaged Benzene concentration in μg/m³
PT08.S2(NMHC) PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)
PT08.S3(NOx) PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
PT08.S4(NO2) PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
PT08.S5(O3) PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)
T Temperature in °C
RH Relative Humidity in %
AH Absolute Humidity
2004-03-
0 10 1360.0 11.9 1046.0 1056.0 1692.0 1268.0 13.6 48.9
18:00:00
2004-03-
1 10 1292.0 9.4 955.0 1174.0 1559.0 972.0 13.3 47.7
19:00:00
2004-03-
2 10 1402.0 9.0 939.0 1140.0 1555.0 1074.0 11.9 54.0
<bound method DataFrame.info of DateTime PT08.S1(CO) C6H6(GT) ... Month Day Day Name
0 2004-03-10 18:00:00 1360.0 11.9 ... 3 10 Wednesday
1 2004-03-10 19:00:00 1292.0 9.4 ... 3 10 Wednesday
2 2004-03-10 20:00:00 1402.0 9.0 ... 3 10 Wednesday
3 2004-03-10 21:00:00 1376.0 9.2 ... 3 10 Wednesday
4 2004-03-10 22:00:00 1272.0 6.5 ... 3 10 Wednesday
... ... ... ... ... ... ... ...
9352 2005-04-04 10:00:00 1314.0 13.5 ... 4 4 Monday
9353 2005-04-04 11:00:00 1163.0 11.4 ... 4 4 Monday
9354 2005-04-04 12:00:00 1142.0 12.4 ... 4 4 Monday
9355 2005-04-04 13:00:00 1003.0 9.5 ... 4 4 Monday
9356 2005-04-04 14:00:00 1071.0 11.9 ... 4 4 Monday
1 # Build a linear regression model using the sklearn module by including all the features except DateTime,Day Name & RH. 2
3 from sklearn.model_selection import train_test_split 4
from sklearn.linear_model import LinearRegression
5 features=list(df.columns.values[1:-1])
6 features.remove('RH')
7 X=df[features]
https://colab.research.google.com/drive/1wWepOEZZlbxKa8Ik128t8pS7kJ0ehxgx#scrollTo=-BxonYJweOlM&printMode=true 1/
3/7/22, 8:08 PM SREEJITH T SHAJI MultipleLinearRegression - Colaboratory
8 y=df['RH']
9
10 # Splitting the DataFrame into the train and test sets. 11 #
Test set will have 33% of the values.
12
Intercept -15028.451823247718
coefficent : [[ 1.48327948e-02 -9.03464156e-01 -5.88095941e-03 1.50325488e-03
2.64965020e-02 -1.06574176e-03 -2.35491907e+00 2.95517421e+01
7.50515310e+00 1.16786097e+00 3.52321248e-02]]
PT08.S1(CO) 0.014832794792690625
C6H6(GT) -0.9034641560183382
PT08.S2(NMHC) -0.005880959405385411
PT08.S3(NOx) 0.0015032548783276978
PT08.S4(NO2) 0.026496502045666503
PT08.S5(O3) -0.001065741763271788
T -2.354919067592639
AH 29.551742104329783
Year 7.505153097892558
Month 1.1678609682998067
Day 0.03523212478929974
1 # Evaluate the linear regression model using the 'r2_score', 'mean_squared_error' & 'mean_absolute_error' functions of the 'skl 2
3 from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error 4 import
numpy as np
5 y_train_pred=sklearn_lin_reg.predict(X_train)
6 y_test_pred=sklearn_lin_reg.predict(X_test)
7 print('Train Set')
8 print('R-squared : ',r2_score(y_train_reshaped,y_train_pred))
9 print('mean squared error : ',mean_squared_error(y_train_reshaped,y_train_pred))
10 print('root mean squared error : ',np.sqrt(mean_squared_error(y_train_reshaped,y_train_pred))) 11
print('mean absolute error : ',mean_absolute_error(y_train_reshaped,y_train_pred))
12 print('\nTest set')
13 print('R-squared : ',r2_score(y_test_reshaped,y_test_pred))
14 print('mean squared error : ',mean_squared_error(y_test_reshaped,y_test_pred))
15 print('root mean squared error : ',np.sqrt(mean_squared_error(y_test_reshaped,y_test_pred))) 16
print('mean absolute error : ',mean_absolute_error(y_test_reshaped,y_test_pred))
17
18
Train Set
R-squared : 0.8785638240066055
mean squared error : 35.11591834141915
root mean squared error : 5.925868572742662 mean
absolute error : 4.571994849644625
Test set
R-squared : 0.8787020691681189
mean squared error : 34.702124455429534
root mean squared error : 5.8908509109830245 mean
absolute error : 4.564460432924346
https://colab.research.google.com/drive/1wWepOEZZlbxKa8Ik128t8pS7kJ0ehxgx#scrollTo=-BxonYJweOlM&printMode=true 2/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
Program : 7
Aim : Program to implement support vector machine
Loading Data
Let's load both the training and the test datasets.
Train Dataset: https://raw.githubusercontent.com/akshayr89/MNSIST_Handwritten_Digit_Recognition-SVM/master/train.csv Test Dataset:
(42000, 785)
(28000, 784)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB
There are 42000 rows and 785 columns in the training dataset.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 167.5 MB
There are 28000 rows and 784 columns in the test dataset. This means we don't have the labels column for the test set.
1 # Print the first and last five columns of both the test and train datasets. 2
print(" train data")
3 print("first 5 columns : ",list(train_df.columns[:5]))
4 print("last 5 columns : ",list(train_df.columns[-5:]))
5 print()
6
7 print("test data")
8 print("first 5 columns : ",list(test_df.columns[:5]))
9 print("last 5 columns : ",list(test_df.columns[-5:]))
train data
first 5 columns : ['label', 'pixel0', 'pixel1', 'pixel2', 'pixel3']
last 5 columns : ['pixel779', 'pixel780', 'pixel781', 'pixel782', 'pixel783']
test data
first 5 columns : ['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4']
last 5 columns : ['pixel779', 'pixel780', 'pixel781', 'pixel782', 'pixel783']
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 1/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
As you can see, the train set has the label column but the test set doesn't.
Now, let's print the first ten rows of the data frame containing the train set.
1 # Print the first ten rows of the data frame containing the train set. 2
The first row contains the pixel values of the image of the handwritten digit 1.
Similarly, the second row contains the pixel values of the image of the handwritten digit 0
The 10th row contains the pixel values of the image of the handwritten digit 3.
1. Create a 1D array containing the pixel values from the training data frame for the image and store it in a variable.
2. Then reshape the above array into a 2D array having 28 rows and 28 columns.
parameters cmap = 'gray', vmin = 0, vmax = 255 .
Note: There are other parameters that can be passed to imshow() function as inputs. But for now, we will pass the above parameters only.
1 # Display the image of the handwritten digit 4 from the train data frame. 2
four_pixels = train_df.iloc[3, 1:]
3 four_pixels = four_pixels.values.reshape(28, 28)
4 plt.figure(figsize = (5, 5), dpi = 81)
5 plt.title("Handwritten Digit 4", fontsize = 16)
6 plt.imshow(four_pixels, cmap = 'gray', vmin = 0, vmax = 255) 7
plt.show()
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 2/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
four_pixels = train_df.iloc[3, 1:] part gets the pixel values of the image of the digit 4
that are stored in the 4th row of the data frame.
four_pixels = four_pixels.values.reshape(28, 28) part first gets the pixel values from the Pandas series in the form of a NumPy array and then
reshapes the 1D array into a 2D array having 28 rows and 28 columns.
plt.figure(figsize = (5, 5), dpi = 81) part sets the figure size.
plt.title("Handwritten Digit 4", fontsize = 16) part sets the title of the plot.
plt.imshow(four_pixels, cmap = 'gray', vmin = 0, vmax = 255) part creates a 2D image in gray colour.
If you look at the axes of the above image, you can see that nearly the first four and last three rows are blank. Similarly, the first five and last five
columns are blank which is denoted by the black colour. So let's print the rows 5 to 26 and columns 6 to 23 of the four_pixels NumPy array
(i.e. four_pixels[4:26, 5:23]) to see the pixel values of the image of the handwritten digit 4.
1 # Print the rows 5 to 26 and columns 5 to 23 of the 'four_pixel' NumPy array to see the pixel values of the image of the handwr 2
print(four_pixels[4:26,5:23])
[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[ 0 220 179 6 0 0 0 0 0 0 0 0 9 77 0 0 0 0]
[ 0 28 247 17 0 0 0 0 0 0 0 0 27 202 0 0 0 0]
[ 0 0 242 155 0 0 0 0 0 0 0 0 27 254 63 0 0 0]
[ 0 0 160 207 6 0 0 0 0 0 0 0 27 254 65 0 0 0]
[ 0 0 127 254 21 0 0 0 0 0 0 0 20 239 65 0 0 0]
[ 0 0 77 254 21 0 0 0 0 0 0 0 0 195 65 0 0 0]
[ 0 0 70 254 21 0 0 0 0 0 0 0 0 195 142 0 0 0]
[ 0 0 56 251 21 0 0 0 0 0 0 0 0 195 227 0 0 0]
[ 0 0 0 222 153 5 0 0 0 0 0 0 0 120 240 13 0 0]
[ 0 0 0 67 251 40 0 0 0 0 0 0 0 94 255 69 0 0]
[ 0 0 0 0 234 184 0 0 0 0 0 0 0 19 245 69 0 0]
[ 0 0 0 0 234 169 0 0 0 0 0 0 0 3 199 182 10 0]
[ 0 0 0 0 154 205 4 0 0 26 72 128 203 208 254 254 131 0]
[ 0 0 0 0 61 254 129 113 186 245 251 189 75 56 136 254 73 0]
[ 0 0 0 0 15 216 233 233 159 104 52 0 0 0 38 254 73 0]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 254 73 0]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 254 73 0]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 206 106 0]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 186 159 0]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 209 101 0]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
From the above output, you can see the non-zero pixel values arranged in the pattern of digit 4.
It is to be noted that the pixel values for a grayscale image range from 0 to 255.
You can also look at the descriptive statistics for the first 10 images in the train data frame.
1 # Create a data frame from the training data frame that contain the pixel values of the images of the digit 6. 2
six_pixels_train_df=train_df.loc[train_df['label'] == 6, :]
3 six_pixels_train_df
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 3/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 pixel10
21 6 0 0 0 0 0 0 0 0 0 0 0
26 6 0 0 0 0 0 0 0 0 0 0 0
.
64 6 0 0 0 0 0 0 0 0 0 0 0
72 6 0 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ...
41921 6 0 0 0 0 0 0 0 0 0 0 0
41927 6 0 0 0 0 0 0 0 0 0 0 0
41967 6 0 0 0 0 0 0 0 0 0 0 0
41993 6 0 0 0 0 0 0 0 0 0 0 0
41998 6 0 0 0 0 0 0 0 0 0 0 0
45 6 0 0 0 0 0 0 0 0 0 0 0
# Create an image from the pixel values of the image of the digit 6 that are stored in row 21.
six_pixels = train_df.iloc[21, 1:]
six_pixels = six_pixels.values.reshape(28, 28)
plt.figure(figsize = (5, 5), dpi = 81)
plt.title("Handwritten Digit 6", fontsize = 16)
plt.imshow(six_pixels, cmap = 'gray', vmin = 0, vmax = 255) 7 plt.show()
Now, let's print the part of the array containing the pixel values of the above image such that their arrangement resembles the digit 6.
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 4/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
1 # S3.8: Print the rows 2 to 22 and columns 5 to 21 of the 'six_pixels' array. 2
print(six_pixels[2:22,5:21])
[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 70]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 27 189 254]
[ 0 0 0 0 0 0 0 0 0 0 0 0 28 219 255 206]
[ 0 0 0 0 0 0 0 0 0 0 8 94 233 248 179 31]
[ 0 0 0 0 0 0 0 0 0 0 146 254 251 84 0 0]
[ 0 0 0 0 0 0 0 0 51 173 252 209 65 0 0 0]
[ 0 0 0 0 0 0 2 119 252 254 146 20 0 0 0 0]
[ 0 0 0 0 0 18 131 254 239 130 25 0 0 0 0 0]
[ 0 0 0 0 17 237 254 239 58 0 0 0 0 0 0 20]
[ 0 0 4 70 223 251 196 61 0 0 0 30 112 138 207 226]
[ 0 0 153 254 228 68 0 0 0 34 143 249 254 233 177 179]
[ 0 67 253 208 40 0 0 31 99 226 241 195 112 14 0 18]
[ 67 241 168 8 0 0 60 239 253 161 37 0 0 0 20 165]
[185 254 74 0 0 43 224 254 116 0 0 0 3 73 205 253]
[252 121 1 0 47 205 230 53 2 0 0 53 176 254 219 118]
[254 107 2 1 127 254 65 5 24 107 198 250 252 195 27 0]
[234 254 199 172 254 254 186 254 254 254 234 134 53 0 0 0]
[109 195 233 250 254 254 254 244 129 46 20 0 0 0 0 0]
[ 0 0 24 71 254 254 254 235 84 0 0 0 0 0 0 0]]
Now, for a machine learning algorithm (in this case, SVM), to correctly identify an image for a digit, it has to figure out the arrangement of pixel values for a
digit on a 2D grid (in this case, 28×28 grid). Knowing this, we can now build a machine learning model (in this case, SVM) to classify the images of
different handwritten digits.
1 # Find out the counts of records for each digit in the training dataset. 2
1 11.152381
7 10.478571
3 10.359524
9 9.971429
2 9.945238
6 9.850000
0 9.838095
4 9.695238
8 9.673810
5 9.035714
Name: label, dtype: float64
Note:
1. The dropna = False parameter counts the number of NA or null values if they are present in a Pandas series.
2. The normalize = True parameter calculates the count of a value as the fraction of the total number of records.
From the count of labels, we can see that the training dataset is balanced. Hence, we can now proceed to build a classification model.
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 5/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
So let's divide each pixel value for each image by 255 (the greatest pixel value for a grayscale image) to reduce the values between 0 and 1.
1 # Create features and target data frames and divide each pixel for each image by 255.0 2
feature_train=train_df.iloc[:,1:]/255.0
3 target_train_actual=train_df['label']
4 feature_train.set_index(keys=target_train_actual,inplace = False).T.describe()
label 1 0 1 4 0 0 7 3
count 784.000000 784.000000 784.000000 784.000000 784.000000 784.000000 784.000000 784.000000 784
SVC(kernel='linear')
Now that we have built a classification model using support vector machines, let's get the predicted digits and then compare the predicted values with
the actual values.
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 6/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
array([[4130, 0, 0, 0, 0, 1, 1, 0, 0, 0],
[ 0, 4674, 2, 1, 0, 0, 0, 0, 6, 1],
[ 2, 7, 4092, 16, 13, 3, 6, 9, 27, 2],
[ 6, 3, 48, 4188, 1, 49, 0, 5, 38, 13],
[ 2, 6, 3, 1, 3999, 0, 1, 3, 0, 57],
[ 4, 8, 12, 67, 4, 3649, 19, 0, 29, 3],
[ 1, 0, 2, 1, 4, 11, 4116, 0, 2, 0],
[ 2, 3, 22, 4, 10, 1, 0, 4308, 2, 49],
[ 11, 30, 19, 60, 2, 49, 3, 2, 3880, 7],
[ 4, 8, 2, 12, 61, 6, 0, 76, 11, 4008]])
1 #Print the precision, recall and f1-score values to further evaluate the efficacy of the model. 2
print(classification_report(target_train_actual,target_train_pred))
The f1-scores for all the labels (or digits) are almost equal to 1. This implies that the SVC model built to classify digits is very accurate. So now let's
predict the digits on the test set.
1 # Divide each pixel value in the test set by 255. Also, for each image pixels, print the minimum and maximum pixel values. 2
feature_test=test_df/255
Now let's predict the digits for the test set using the SVC model that we just built.
Note: The code below may take 3 to 5 minutes to execute.
1 # Predict the digits for the test set using the SVC model built above. 2
target_test_pred=svc_dp_linear.predict(feature_test)
3 target_test_pred
Now let's get the count of the predicted labels (or handwritten digits) to see their distribution.
1 # Get the count of the predicted labels (or handwritten digits) to see their distribution.
2 pd.Series(target_test_pred).value_counts()
1 3288
2 2882
7 2868
3 2818
0 2810
4 2808
6 2729
9 2677
8 2609
5 2511
dtype: int64
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 7/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
It seems that the handwritten digits in the test set are quite uniformly distributed.
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 8/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
1 # Add 'label' at column index = 0 to the 'test_df' data frame so that its values are the predicted labels (or digits). 2
test_df.insert(loc =0,column='label',value=target_test_pred)
Let's display the first 5 rows of the modified test_df data frame.
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 pixel10 pixe
0 2 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0
2 5 0 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 0 0
4 3 0 0 0 0 0 0 0 0 0 0 0
Now let's group all the rows of the test_df data frame by the label column so that pixel values of images of a digit can be clubbed together and
a sample of a digit can be retrieved easily later.
E.g., you can easily retrieve one of the sample images of digit 0 from a data frame containing pixel values of all the image samples of the digit 0 only.
1 # Group all the rows of the 'test_df' data frame by the 'label' column. Also, get a data frame containing pixel values of imag 2
grouped_test_df = test_df.groupby(by = "label")
3 zeros_test_df = grouped_test_df.get_group(0) 4
zeros_test_df
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 pixel10
1 0 0 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 0 0
13 0 0 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0 0 0
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 9/
3/7/22, 8:10 PM SREEJITH T SHAJI Support Vector Machines - MNIST Digits Classification -
Colaboratory
... ... ... ... ... ... ... ... ... ... ... ... ...
27967 0 0 0 0 0 0 0 0 0 0 0 0
27971 0 0 0 0 0 0 0 0 0 0 0 0
27974 0 0 0 0 0 0 0 0 0 0 0 0
27977 0 0 0 0 0 0 0 0 0 0 0 0
27983 0 0 0 0 0 0 0 0 0 0 0 0
Now, let's create an image from the pixel values of one of the samples of digit 0.
1 # Create an image from the pixel values of one of the samples of digit 0.
2 sample_of_zero_test_pixels = test_df.iloc[6, 1:].values.reshape(28, 28)
3
4 plt.figure(figsize = (6, 6), dpi = 81)
5 plt.title("Handwritten Digit 0 Image", fontsize = 16)
6 plt.imshow(sample_of_zero_test_pixels, cmap = "gray", vmin = 0, vmax = 255)
7 plt.show()
Indeed the predicted image is 0. Let's create an image of one of the sample images of digit three.
1 # Get a data frame containing pixel values of all images of digit 3 from 'grouped_test_df' data frame. 2
grouped_test_df = test_df.groupby(by = 'label')
3 zeros_test_df = grouped_test_df.get_group(3) 4
zeros_test_df
5
6
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 pixel10
4 3 0 0 0 0 0 0 0 0 0 0 0
Now, let's create an image of one of the sample images of digit 3.
1 # Create an image of one of the sample images of digit 3.
2 sample_of_zero_test_pixels = test_df.iloc[4, 1:].values.reshape(28, 28)
3
4 plt.figure(figsize = (6, 6), dpi = 81)
5 plt.title("Handwritten Digit 3 Image", fontsize = 16)
6 plt.imshow(sample_of_zero_test_pixels, cmap = "gray", vmin = 0, vmax = 255)
7 plt.show()
https://colab.research.google.com/drive/1yRxpEhgaSYM7uHL5ruJ-t6KwBKrRc90R#scrollTo=vhwhnTmmuQVO&printMode=true 1
3/7/22, 8:11 PM SREEJITH T SHAJI K means -CO4 -
Colaboratory
Program : 8
Aim : Program to implement k-means clustering technique using any standard dataset available in the public domain.
Problem Statement
Program to implement k-means clustering technique using any standard dataset available in the public domain
Dataset Description
In this project, we will be using the dataset holding the information of carbon dioxide emission from different car models. The
Volume Total space available inside the car (in litres) Weight
List of Activities
Activity 1: Import Modules and Read Data
Activity 2: Data Cleaning
Activity 3: Find Optimal Value of K
Activity 4: Plot Silhouette Scores
Read the data from a CSV file to create a Pandas DataFrame and go through the necessary data-cleaning process (if required).
Dataset link: https://raw.githubusercontent.com/jiss-sngce/CO_3/main/jkcars.csv
1 # Import the modules and Read the data. 2
3 import numpy as np 4
import pandas as pd
5 df=pd.read_csv('https://raw.githubusercontent.com/jiss-sngce/CO_3/main/jkcars.csv')
6
7 # Print the first five records 8
9 df.head()
1 # Get the total number of rows and columns, data types of columns and missing values (if exist) in the dataset 2 df.shape
https://colab.research.google.com/drive/12BW12BwUPCYFJUA8ak7qGze5omBItkEX#scrollTo=P7DeDzA-DXUF&printMode=true 1/
3/7/22, 8:11 PM SREEJITH T SHAJI K means -CO4 -
Colaboratory
3 df.dtypes
4 df.isnull().sum()
Car 0
Model 0
Volume 0
Weight 0
CO2 0
dtype: int64
1. Create a subset of the dataset consisting of three columns i.e Volume , Weight , and CO2 .
0 1200 1160 95
1 1000 929 95
2 900 865 90
2. Compute K-Means clustering for the 3D dataset data_3d by varying K from 2 to 10 clusters. Also, for each K , calculate silhouette score
using silhouette_score function.
Steps to Follow
Create an empty list to store silhouette scores obtained for each K (let's say sil_scores ). Initiate a
Perform K-means clustering for the current value of K inside for loop. Use
Calculate silhouette score for current K value using silhouette_score() function and append it to the empty list sil_scores .
Create a DataFrame with two columns. The first column must contain K values from 2 to 10 and the second column must contain
silhouette values obtained after the for loop.
https://colab.research.google.com/drive/12BW12BwUPCYFJUA8ak7qGze5omBItkEX#scrollTo=P7DeDzA-DXUF&printMode=true 2/
3/7/22, 8:11 PM SREEJITH T SHAJI K means -CO4 -
Colaboratory
K value silhouette_score
0 2 0.466982
1 3 0.569304
2 4 0.506027
3 5 0.537547
4 6 0.549792
5 7 0.525962
6 8 0.509034
7 9 0.461402
8 10 0.434958
Q: What are the maximum silhouette score and the corresponding cluster value?
A: The maximum silhouette score in the table above is 0.569304, obtained for K = 3.
https://colab.research.google.com/drive/12BW12BwUPCYFJUA8ak7qGze5omBItkEX#scrollTo=P7DeDzA-DXUF&printMode=true 3/
3/7/22, 8:11 PM SREEJITH T SHAJI K means -CO4 -
Colaboratory
Activity 4: Plot silhouette Scores & WCSS Scores to find optimal value for K
Create a line plot with K ranging from 2 to 10 on the x-axis and the silhouette scores stored in sil_scores list on the y -axis.
3 plt.figure(figsize=(14,5))
4 plt.plot(clusters,sil_scores)
5 plt.xlabel("K value")
6 plt.ylabel("silhouette_score")
7 plt.xticks(range(2,11))
8 plt.grid()
9 plt.show()
https://colab.research.google.com/drive/12BW12BwUPCYFJUA8ak7qGze5omBItkEX#scrollTo=P7DeDzA-DXUF&printMode=true 4/
3/7/22, 8:11 PM SREEJITH T SHAJI K means -CO4 -
Colaboratory
2 16
1 9
0 7
dtype: int64
1 cluster_labels
1 df.columns
https://colab.research.google.com/drive/12BW12BwUPCYFJUA8ak7qGze5omBItkEX#scrollTo=P7DeDzA-DXUF&printMode=true 5/
3/7/22, 8:11 PM SREEJITH T SHAJI K means -CO4 -
Colaboratory
Car Model Volume Weight CO2 label
https://colab.research.google.com/drive/12BW12BwUPCYFJUA8ak7qGze5omBItkEX#scrollTo=P7DeDzA-DXUF&printMode=true 6/
3/7/22, 8:57 PM SREEJITH T SHAJI T EXP9_HandWrittenDigitRecognition.ipynb
Program : 9
Aim : Programs on convolutional neural network to classify images from any standard dataset in the public domain.
Importing Necessary Libraries
1 import tensorflow as tf
2 from tensorflow import keras
3 import matplotlib.pyplot as plt 4
import numpy as np
5
6 #load MNIST dataset available in Keras library
7 (X_train, y_train) , (X_test, y_test) = keras.datasets.mnist.load_data()
(28, 28)
1 #matshow() function OR imshow() function is used to represent an array as a matrix in a new figure window. 2
#plt.matshow(X_train[0]) OR plt.imshow(X_train[2])
3 plt.imshow(X_train[0])
4
<matplotlib.image.AxesImage at 0x7f760be9f990>
https://colab.research.google.com/drive/1P2w79xKLHyWOyLm0UXlAuVlP9h2DTHvQ#scrollTo=eoaqru1BouMK&printMode=true 7/
3/7/22, 8:57 PM SREEJITH T SHAJI T EXP9_HandWrittenDigitRecognition.ipynb
array([[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.01176471, 0.07058824, 0.07058824,
0.07058824, 0.49411765, 0.53333333, 0.68627451, 0.10196078,
0.65098039, 1. , 0.96862745, 0.49803922, 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.11764706, 0.14117647,
0.36862745, 0.60392157, 0.66666667, 0.99215686, 0.99215686,
0.99215686, 0.99215686, 0.99215686, 0.88235294, 0.6745098 ,
0.99215686, 0.94901961, 0.76470588, 0.25098039, 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.19215686, 0.93333333, 0.99215686,
0.99215686, 0.99215686, 0.99215686, 0.99215686, 0.99215686,
0.99215686, 0.99215686, 0.98431373, 0.36470588, 0.32156863,
0.32156863, 0.21960784, 0.15294118, 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.07058824, 0.85882353, 0.99215686,
0.99215686, 0.99215686, 0.99215686, 0.99215686, 0.77647059,
0.71372549, 0.96862745, 0.94509804, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.31372549, 0.61176471,
0.41960784, 0.99215686, 0.99215686, 0.80392157, 0.04313725,
0. , 0.16862745, 0.60392157, 0. , 0. ,
(60000, 784)
1 X_test_flattened.shape
(10000, 784)
https://colab.research.google.com/drive/1P2w79xKLHyWOyLm0UXlAuVlP9h2DTHvQ#scrollTo=eoaqru1BouMK&printMode=true 8/
3/7/22, 8:57 PM SREEJITH T SHAJI T EXP9_HandWrittenDigitRecognition.ipynb
Epoch 1/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.4654 - accuracy: 0.8789
Epoch 2/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.3033 - accuracy: 0.9158
Epoch 3/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.2828 - accuracy: 0.9212
Epoch 4/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.2730 - accuracy: 0.9244
Epoch 5/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.2670 - accuracy: 0.9258
Epoch 6/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.2621 - accuracy: 0.9269
Epoch 7/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.2583 - accuracy: 0.9281
Epoch 8/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.2557 - accuracy: 0.9295
Epoch 9/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.2531 - accuracy: 0.9299
Epoch 10/10
1875/1875 [==============================] - 2s 1ms/step - loss: 0.2510 - accuracy: 0.9311
313/313 [==============================] - 0s 930us/step - loss: 0.2653 - accuracy: 0.9266
array([3.0872768e-01, 3.6514401e-03, 9.9983525e-01, 1.5331820e-01,
9.3455755e-12, 9.1628104e-01, 9.2582357e-01, 2.9462129e-15,
1.1721602e-01, 2.1716836e-12], dtype=float32)
<matplotlib.image.AxesImage at 0x7f760bf3d450>
https://colab.research.google.com/drive/1P2w79xKLHyWOyLm0UXlAuVlP9h2DTHvQ#scrollTo=eoaqru1BouMK&printMode=true 9/
3/7/22, 8:57 PM SREEJITH T SHAJI T EXP9_HandWrittenDigitRecognition.ipynb
https://colab.research.google.com/drive/1P2w79xKLHyWOyLm0UXlAuVlP9h2DTHvQ#scrollTo=eoaqru1BouMK&printMode=true 1
3/7/22, 8:57 PM SREEJITH T SHAJI T EXP9_HandWrittenDigitRecognition.ipynb
1 #Display the predicted values for test dataset. Display only first 5 predicted values 2
[7, 2, 1, 0, 4]
1 model = keras.Sequential([
2 keras.layers.Dense(100,activation='relu'),
3 keras.layers.Dense(10, activation='sigmoid')
4 ])
5
1 model.compile(optimizer='adam',
2 loss='sparse_categorical_crossentropy',
3 metrics=['accuracy']) 4
Epoch 1/5
1875/1875 [==============================] - 5s 2ms/step - loss: 0.2777 - accuracy: 0.9199
Epoch 2/5
1875/1875 [==============================] - 4s 2ms/step - loss: 0.1245 - accuracy: 0.9636
Epoch 3/5
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0875 - accuracy: 0.9743
Epoch 4/5
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0681 - accuracy: 0.9792
Epoch 5/5
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0537 - accuracy: 0.9835
<keras.callbacks.History at 0x7f6092d38290>
https://colab.research.google.com/drive/1P2w79xKLHyWOyLm0UXlAuVlP9h2DTHvQ#scrollTo=eoaqru1BouMK&printMode=true 1
3/7/22, 8:57 PM SREEJITH T SHAJI T EXP9_HandWrittenDigitRecognition.ipynb
1 model.evaluate(X_test_flattened,y_test)
6 - accuracy: 0.9751
[0.07863984256982803, 0.9750999808311
_test_flattened)
2 y_predicted
array([[1.9663182e-01, 1.0661781e-03, 7.0703042e-01, ..., 9.9997139e-01,
1.5860826e-02, 4.3196589e-02],
[1.9181639e-01, 9.9905562e-01, 9.9999869e-01, ..., 3.5023191e-08,
2.3184523e-01, 3.8511429e-05],
[1.7279387e-04, 9.9927801e-01, 1.4470059e-01, ..., 4.1472608e-01,
1.3068342e-01, 4.1025519e-02],
...,
[1.7130093e-06, 8.7543085e-06, 7.7272634e-06, ..., 3.8716146e-01,
4.9101233e-02, 6.9311082e-01],
[2.6747435e-02, 1.8883626e-05, 8.4262902e-05, ..., 2.1099478e-02,
6.4012915e-01, 1.1537667e-04],
[1.7884678e-01, 9.4580650e-04, 7.7513754e-03, ..., 3.5389752e-05,
2.2894144e-04, 5.7818677e-05]], dtype=float32)
1 y_predicted_labels = [np.argmax(i) for i in y_predicted]
2
1 y_predicted[0]
array([1.9663182e-01, 1.0661781e-03, 7.0703042e-01, 9.4925106e-01,
5.5770590e-07, 1.5124649e-02, 6.6381671e-07, 9.9997139e-01,
1.5860826e-02, 4.3196589e-02], dtype=float32)
1 np.argmax(y_predicted[0])
7
1 y_predicted_labels = [np.argmax(i) for i in y_predicted]
2 y_predicted_labels[:5]
3
[7, 2, 1, 0, 4]
1 plt.matshow(X_test[0])
<matplotlib.image.AxesImage at 0x7f60938b09d0>
https://colab.research.google.com/drive/1P2w79xKLHyWOyLm0UXlAuVlP9h2DTHvQ#scrollTo=eoaqru1BouMK&printMode=true 1
3/7/22, 8:57 PM SREEJITH T SHAJI T EXP9_HandWrittenDigitRecognition.ipynb
model = keras.Sequential([
keras.layers.Flatten(input_shape=(28, 28)),
keras.layers.Dense(100, activation='relu'),
keras.layers.Dense(10, activation='sigmoid')
])
1 model.compile(optimizer='adam',
2 loss='sparse_categorical_crossentropy',
3 metrics=['accuracy']) 4
Epoch 1/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.2714 - accuracy: 0.9232
Epoch 2/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.1247 - accuracy: 0.9638
Epoch 3/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0866 - accuracy: 0.9735
Epoch 4/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0651 - accuracy: 0.9807
Epoch 5/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0507 - accuracy: 0.9847
Epoch 6/10
1875/1875 [==============================] - 5s 3ms/step - loss: 0.0406 - accuracy: 0.9874
Epoch 7/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0339 - accuracy: 0.9895
Epoch 8/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0282 - accuracy: 0.9913
Epoch 9/10
1875/1875 [==============================] - 4s 2ms/step - loss: 0.0233 - accuracy: 0.9926
Epoch 10/10
1875/1875 [==============================] - 5s 3ms/step - loss: 0.0190 - accuracy: 0.9946
<keras.callbacks.History at 0x7f6095650d10>
1 model.evaluate(X_test,y_test)
https://colab.research.google.com/drive/1P2w79xKLHyWOyLm0UXlAuVlP9h2DTHvQ#scrollTo=eoaqru1BouMK&printMode=true 1
3/7/22, 8:15 PM CO5 Natural Language Toolkit SREEJITH T SHAJI
Program : 10
Aim: Implement problems on natural language processing - Part of Speech tagging, N-gram & smoothening and Chunking using NLTK
Short notes:
The Natural Language Toolkit (NLTK) is a platform used for building programs for text analysis. One of the more powerful aspects of the
NLTK module is the Part of Speech tagging.
Part-of-speech (POS) tagging is the process of converting a sentence into forms such as a list of words or a list of tuples (where each tuple has
the form (word, tag)). The tag is a part-of-speech tag, and signifies whether the word is a noun, adjective, verb, and so on.
keywords:
Token : Each “entity” that is a part of whatever was split up based on rules.
CD cardinal digit
FW foreign word
IN preposition/subordinating conjunction
JJ adjective ‘big’
N-grams are continuous sequences of words or symbols or tokens in a document. In technical terms, they can be defined as the
neighbouring sequences of items in a document.
Steps for n-gram model:
Explore the dataset
Feature extraction
https://colab.research.google.com/drive/1I48Pi70Hlku8oHYmpnMbed1Mx8mAOBJ2?usp=sharing 1/
3/7/22, 8:15 PM CO5 Natural Language Toolkit SREEJITH T SHAJI
Train-test split
Basic pre-processing
Creating unigrams
Creating bigrams
Creating trigrams
1 print(sw)
{'than', "wouldn't", 'having', "you'll", 'during', 'his', 'she', 'in', 'most', 'ours', 'how', "hasn't", 'sho
1 print(tokenized)
https://colab.research.google.com/drive/1I48Pi70Hlku8oHYmpnMbed1Mx8mAOBJ2?usp=sharing 2/
3/7/22, 8:15 PM CO5 Natural Language Toolkit SREEJITH T SHAJI
N-gram model
1 import numpy as np 2
import pandas as pd
3 import matplotlib.pyplot as plt 4
plt.style.use(style='seaborn')
5 #get the data from https://www.kaggle.com/ankurzing/sentiment-analysis-for-financial-news/version/5 6
colnames=['sentiment','news']
7 df=pd.read_csv('all-data.csv - all-data.csv.csv',encoding='ISO-8859-1',names=colnames) 8
df.head()
sentiment news
1 df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
# Column Non-Null Count Dtype
1 df['sentiment'].value_counts()
neutral 2879
positive 1363
negative 604
Name: sentiment, dtype: int64
1 y=df['sentiment'].values
2 y.shape
(4846,)
1 x=df['news'].values
2 x.shape
(4846,)
(3392,)
(1454,)
(3392,)
(1454,)
https://colab.research.google.com/drive/1I48Pi70Hlku8oHYmpnMbed1Mx8mAOBJ2?usp=sharing 3/
3/7/22, 8:15 PM CO5 Natural Language Toolkit SREEJITH T SHAJI
1 #removing punctuations
2 #library that contains punctuation 3
import string
4 string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
1 text='darsana*@123'
2 remove_pun(text)
'darsana123'
1 #storing the puntuation free text in a new column called clean_msg 2 df_train['news']=df_train['news'].apply(remove_pun)
3 df_test['news']=df_test['news'].apply(remove_pun)
4 #punctuations are removed from news column in train dataset 5
df_train.head()
news sentiment
1 import nltk
2 from nltk.corpus import stopwords 3
nltk.download('stopwords')
https://colab.research.google.com/drive/1I48Pi70Hlku8oHYmpnMbed1Mx8mAOBJ2?usp=sharing 4/
3/7/22, 8:15 PM CO5 Natural Language Toolkit SREEJITH T SHAJI
# Method to generate n-grams.
# params:
#   text  - the text for which we have to generate n-grams
#   ngram - number of grams to be generated from the text (1, 2, 3, 4 etc.,
#           default value = 1)
def generate_N_grams(text, ngram=1):
    """Return the word n-grams of *text* after dropping English stopwords.

    The text is split on single spaces, so consecutive spaces yield empty
    tokens, which are kept. Tokens are compared against the NLTK English
    stopword list exactly as written (no case folding). The surviving tokens
    are printed, then every run of ``ngram`` consecutive tokens is joined
    with a single space.

    Args:
        text: The string to generate n-grams from.
        ngram: Number of tokens per gram (1 = unigrams, 2 = bigrams, ...).

    Returns:
        A list of n-gram strings. It is empty when fewer than ``ngram``
        tokens remain, because ``zip`` stops at the shortest slice.
    """
    # Build the stopword set once; the membership test inside the
    # comprehension would otherwise rebuild it for every token.
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split(" ") if word not in stop_words]
    print("After removing stopwords:", words)
    # Zipping the token list against itself shifted by 0..ngram-1 positions
    # yields one tuple per sliding window of length ``ngram``.
    windows = zip(*[words[i:] for i in range(ngram)])
    return [' '.join(window) for window in windows]
1 name=['sajil','lidhan']
2 s=' '.join(name)
3 s
1 s1=['abeel','adhitya','akash','akshaya']
2 s2=[1,2,3,4]
3 s3=zip(s1,s2)
4 print(set(s3))
https://colab.research.google.com/drive/1I48Pi70Hlku8oHYmpnMbed1Mx8mAOBJ2?usp=sharing 5/