ML Practical 1
ML Practical 1
In [1]:
1 import pandas as pd
2 import numpy as np
3 import seaborn as sns
4 import matplotlib.pyplot as plt
In [4]:
1 df = pd.read_csv("uber.csv")  # load the ride data; the path is relative to the notebook's working directory
2 df  # display the loaded frame
Out[4]:
Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passen
0
2015-05-07 2015-05-07
0 24238194 7.5 -73.999817 40.738354 -73.999512 40.723217
19:52:06.0000003 19:52:06 UTC
2009-07-17 2009-07-17
1 27835199 7.7 -73.994355 40.728225 -73.994710 40.750325
20:04:56.0000002 20:04:56 UTC
2009-08-24 2009-08-24
2 44984355 12.9 -74.005043 40.740770 -73.962565 40.772647
21:45:00.00000061 21:45:00 UTC
2009-06-26 2009-06-26
3 25894730 5.3 -73.976124 40.790844 -73.965316 40.803349
08:22:21.0000001 08:22:21 UTC
2014-08-28 2014-08-28
4 17610152 16.0 -73.925023 40.744085 -73.973082 40.761247
17:47:00.000000188 17:47:00 UTC
2012-10-28 2012-10-28
199995 42598914 3.0 -73.987042 40.739367 -73.986525 40.740297
10:49:00.00000053 10:49:00 UTC
2014-03-14 2014-03-14
199996 16382965 7.5 -73.984722 40.736837 -74.006672 40.739620
01:09:00.0000008 01:09:00 UTC
2009-06-29 2009-06-29
199997 27804658 30.9 -73.986017 40.756487 -73.858957 40.692588
00:42:00.00000078 00:42:00 UTC
2015-05-20 2015-05-20
199998 20259894 14.5 -73.997124 40.725452 -73.983215 40.695415
14:56:25.0000004 14:56:25 UTC
2010-05-15 2010-05-15
199999 11951496 14.1 -73.984395 40.720077 -73.985508 40.768793
04:08:00.00000076 04:08:00 UTC
In [5]:
1 df.head()  # preview the first five rows
Out[5]:
Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_co
0
2015-05-07 2015-05-07
0 24238194 7.5 -73.999817 40.738354 -73.999512 40.723217
19:52:06.0000003 19:52:06 UTC
2009-07-17 2009-07-17
1 27835199 7.7 -73.994355 40.728225 -73.994710 40.750325
20:04:56.0000002 20:04:56 UTC
2009-08-24 2009-08-24
2 44984355 12.9 -74.005043 40.740770 -73.962565 40.772647
21:45:00.00000061 21:45:00 UTC
2009-06-26 2009-06-26
3 25894730 5.3 -73.976124 40.790844 -73.965316 40.803349
08:22:21.0000001 08:22:21 UTC
2014-08-28 2014-08-28
4 17610152 16.0 -73.925023 40.744085 -73.973082 40.761247
17:47:00.000000188 17:47:00 UTC
localhost:8888/notebooks/Untitled.ipynb 1/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [6]:
1 df.info()  # per-column dtype and non-null count; the two dropoff coordinate columns each report one missing value
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 200000 non-null int64
1 key 200000 non-null object
2 fare_amount 200000 non-null float64
3 pickup_datetime 200000 non-null object
4 pickup_longitude 200000 non-null float64
5 pickup_latitude 200000 non-null float64
6 dropoff_longitude 199999 non-null float64
7 dropoff_latitude 199999 non-null float64
8 passenger_count 200000 non-null int64
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB
In [7]:
1 df.columns  # list the column labels
Out[7]:
In [8]:
In [9]:
1 df.head()  # preview the first five rows after the preceding cell (source not visible in this export)
Out[9]:
In [10]:
1 df.shape  # (rows, columns); 7 columns remain of the 9 that df.info() reported
Out[10]:
(200000, 7)
In [11]:
1 df.dtypes  # pickup_datetime is still an object (string) column at this point
Out[11]:
fare_amount float64
pickup_datetime object
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count int64
dtype: object
localhost:8888/notebooks/Untitled.ipynb 2/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [12]:
1 df.describe()  # summary statistics for the numeric columns
Out[12]:
In [13]:
1 df.isnull().sum()  # number of missing values per column
Out[13]:
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64
In [14]:
1 df['dropoff_latitude'].fillna(value=df['dropoff_latitude'].mean(), inplace=True)
In [15]:
1 df.isnull().sum()  # re-check missing values after filling dropoff_latitude
Out[15]:
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 0
passenger_count 0
dtype: int64
In [16]:
1 df['dropoff_longitude'].fillna(value=df['dropoff_longitude'].median(), inplace=True)
In [17]:
1 df.isnull().sum()  # re-check missing values after filling dropoff_longitude
Out[17]:
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64
localhost:8888/notebooks/Untitled.ipynb 3/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [18]:
1 df.dtypes  # column dtypes before the datetime conversion; pickup_datetime is still object
Out[18]:
fare_amount float64
pickup_datetime object
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count int64
dtype: object
In [19]:
In [20]:
1 df.dtypes  # confirm pickup_datetime is now datetime64[ns, UTC]
Out[20]:
fare_amount float64
pickup_datetime datetime64[ns, UTC]
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count int64
dtype: object
In [21]:
localhost:8888/notebooks/Untitled.ipynb 4/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [22]:
1 df.head
Out[22]:
localhost:8888/notebooks/Untitled.ipynb 5/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [41]:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_19836\3060153060.py in <module>
----> 1 df = df.drop('pickup_datetime', axis =1)
In [ ]:
1 df.head
In [25]:
1 df.dtypes  # pickup_datetime is gone; hour/day/month/year/dayofweek are present as int64 columns
Out[25]:
fare_amount float64
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count int64
hour int64
day int64
month int64
year int64
dayofweek int64
dtype: object
localhost:8888/notebooks/Untitled.ipynb 6/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [26]:
Out[26]:
fare_amount AxesSubplot(0.125,0.786098;0.352273x0.0939024)
pickup_longitude AxesSubplot(0.547727,0.786098;0.352273x0.0939024)
pickup_latitude AxesSubplot(0.125,0.673415;0.352273x0.0939024)
dropoff_longitude AxesSubplot(0.547727,0.673415;0.352273x0.0939024)
dropoff_latitude AxesSubplot(0.125,0.560732;0.352273x0.0939024)
passenger_count AxesSubplot(0.547727,0.560732;0.352273x0.0939024)
hour AxesSubplot(0.125,0.448049;0.352273x0.0939024)
day AxesSubplot(0.547727,0.448049;0.352273x0.0939024)
month AxesSubplot(0.125,0.335366;0.352273x0.0939024)
year AxesSubplot(0.547727,0.335366;0.352273x0.0939024)
dayofweek AxesSubplot(0.125,0.222683;0.352273x0.0939024)
dtype: object
localhost:8888/notebooks/Untitled.ipynb 7/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [27]:
In [28]:
1 df = treat_outliers_all(df,df.iloc[:, 0::])  # df.iloc[:, 0::] selects every column, so all columns (including the fare_amount target) are passed; treat_outliers_all is defined in a cell not visible in this export — TODO confirm how it uses the second argument
localhost:8888/notebooks/Untitled.ipynb 8/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [29]:
Out[29]:
fare_amount AxesSubplot(0.125,0.786098;0.352273x0.0939024)
pickup_longitude AxesSubplot(0.547727,0.786098;0.352273x0.0939024)
pickup_latitude AxesSubplot(0.125,0.673415;0.352273x0.0939024)
dropoff_longitude AxesSubplot(0.547727,0.673415;0.352273x0.0939024)
dropoff_latitude AxesSubplot(0.125,0.560732;0.352273x0.0939024)
passenger_count AxesSubplot(0.547727,0.560732;0.352273x0.0939024)
hour AxesSubplot(0.125,0.448049;0.352273x0.0939024)
day AxesSubplot(0.547727,0.448049;0.352273x0.0939024)
month AxesSubplot(0.125,0.335366;0.352273x0.0939024)
year AxesSubplot(0.547727,0.335366;0.352273x0.0939024)
dayofweek AxesSubplot(0.125,0.222683;0.352273x0.0939024)
dtype: object
In [ ]:
localhost:8888/notebooks/Untitled.ipynb 9/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [42]:
1 import haversine as hs
2 travel_dist = []
3 for pos in range(len(df['pickup_longitude'])):
4 long1,lati1,long2,lati2 = [df['pickup_longitude'][pos],df['pickup_latitude'][pos],df['dropoff_longitude'][pos],df['dropoff_l
5 loc1 =(lati1,long1)
6 loc2 =(lati2,long2)
7 c = hs.haversine(loc1,loc2)
8 travel_dist.append(c)
9
10 print(travel_dist)
11 df['dist_travel_km']= travel_dist
12 df.head
Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)
Out[42]:
dist_travel_km
0 12758.286154
1 12756.625962
2 12754.446942
3 12760.320445
4 12755.985008
... ...
199995 12756.954766
199996 12758.913797
199997 12751.688525
199998 12755.039558
199999 12754.696728
In [43]:
1 df = df.loc[(df.dist_travel_km>=1) |(df.dist_travel_km<=130) ]
2 print("Remaining obervation:" , df.shape)
localhost:8888/notebooks/Untitled.ipynb 10/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [44]:
In [45]:
In [46]:
1 df.head()  # preview the first five rows, now including the distance column
Out[46]:
fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count hour day month year dayofweek dist_t
In [47]:
1 df.isnull().sum()  # confirm no column has missing values before modelling
Out[47]:
fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
hour 0
day 0
month 0
year 0
dayofweek 0
dist_travel_km 0
dtype: int64
localhost:8888/notebooks/Untitled.ipynb 11/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [48]:
1 sns.heatmap(df.isnull())  # visual check of the boolean missing-value mask
Out[48]:
<AxesSubplot:>
In [49]:
1 corr = df.corr()  # pairwise correlation matrix of the columns
In [50]:
1 corr  # display the correlation matrix
Out[50]:
fare_amount 1.000000 0.154069 -0.110842 0.218675 -0.125898 0.015778 -0.023623 0.004534 0.030
pickup_longitude 0.154069 1.000000 0.259497 0.425619 0.073290 -0.013213 0.011579 -0.003204 0.001
pickup_latitude -0.110842 0.259497 1.000000 0.048889 0.515714 -0.012889 0.029681 -0.001553 0.001
dropoff_longitude 0.218675 0.425619 0.048889 1.000000 0.245667 -0.009303 -0.046558 -0.004007 0.002
dropoff_latitude -0.125898 0.073290 0.515714 0.245667 1.000000 -0.006308 0.019783 -0.003479 -0.001
passenger_count 0.015778 -0.013213 -0.012889 -0.009303 -0.006308 1.000000 0.020274 0.002712 0.010
hour -0.023623 0.011579 0.029681 -0.046558 0.019783 0.020274 1.000000 0.004677 -0.003
day 0.004534 -0.003204 -0.001553 -0.004007 -0.003479 0.002712 0.004677 1.000000 -0.017
month 0.030817 0.001169 0.001562 0.002391 -0.001193 0.010351 -0.003926 -0.017360 1.000
year 0.141277 0.010198 -0.014243 0.011346 -0.009603 -0.009749 0.002156 -0.012170 -0.115
dayofweek 0.013652 -0.024652 -0.042310 -0.003336 -0.031919 0.048550 -0.086947 0.005617 -0.008
dist_travel_km -0.233982 -0.091832 0.731839 -0.644884 0.227001 -0.003515 0.054475 0.001546 -0.000
localhost:8888/notebooks/Untitled.ipynb 12/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [100]:
Out[100]:
<Axes: >
In [51]:
1 df.dtypes  # passenger_count is float64 here, whereas it was int64 before the outlier-treatment cell
Out[51]:
fare_amount float64
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count float64
hour int64
day int64
month int64
year int64
dayofweek int64
dist_travel_km float64
dtype: object
In [59]:
In [60]:
1 y = df['fare_amount']  # target series: the fare column
In [61]:
localhost:8888/notebooks/Untitled.ipynb 13/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [62]:
In [63]:
1 regression.fit(x_train,y_train)  # fit the LinearRegression on the training split; regression, x_train and y_train come from cells not visible in this export
Out[63]:
LinearRegression()
In [65]:
1 regression.intercept_  # fitted intercept term
Out[65]:
2810.586867670241
In [66]:
1 regression.coef_  # fitted coefficients, one per feature column of x_train
Out[66]:
In [67]:
1 prediction = regression.predict(x_test)  # predicted fares for the test split
In [68]:
1 print(prediction)  # plain-text dump of the predicted values
In [69]:
1 y_test  # actual fares of the test split (66,000 rows), for comparison with the predictions
Out[69]:
168553 22.25
24295 12.50
54374 3.70
77398 16.00
80636 4.00
...
114623 6.10
194330 14.50
52536 4.50
136642 12.00
84566 6.00
Name: fare_amount, Length: 66000, dtype: float64
In [72]:
In [73]:
1 r2_score(y_test,prediction)  # R² of the predictions on the test split; r2_score is imported in a cell not visible here (presumably sklearn.metrics — confirm)
Out[73]:
0.41928436137240455
In [74]:
In [75]:
1 MSE = mean_squared_error(y_test,prediction)  # mean squared error on the test split; mean_squared_error is imported in a cell not visible here (presumably sklearn.metrics — confirm)
localhost:8888/notebooks/Untitled.ipynb 14/15
7/19/23, 7:25 PM Untitled - Jupyter Notebook
In [76]:
1 MSE  # display the mean squared error
Out[76]:
16.961152324796327
In [ ]:
localhost:8888/notebooks/Untitled.ipynb 15/15