Notes_Uber_Data_analysis_project
Notes_Uber_Data_analysis_project
In [6]: dataset
West
01-06-2016 01-06-
4 Business Fort Pierce Palm 63.7 Customer V
14:42 2016 15:49
Beach
12/31/2016 12/31/2016
1153 Business Katunayake Gampaha 6.4 Temporary S
21:32 21:50
12/31/2016 12/31/2016
1154 Business Gampaha Ilukwatta 48.2 Temporary S
22:08 23:51
In [8]: dataset.shape
Out[8]: (1156, 7)
In [10]: dataset.info()
file:///C:/Users/swati/Downloads/Untitled.html 1/11
11/21/24, 4:44 PM Untitled
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 START_DATE 1156 non-null object
1 END_DATE 1155 non-null object
2 CATEGORY 1155 non-null object
3 START 1155 non-null object
4 STOP 1155 non-null object
5 MILES 1156 non-null float64
6 PURPOSE 653 non-null object
dtypes: float64(1), object(6)
memory usage: 63.3+ KB
Data Preprocessing
In [15]: dataset['PURPOSE'].fillna("NOT", inplace = True)
C:\Users\swati\AppData\Local\Temp\ipykernel_31136\4083644620.py:1: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained
assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work
because the intermediate object on which we are setting values always behaves as a
copy.
In [17]: dataset.head()
West
01-06-2016 01-06-2016 Fort
4 Business Palm 63.7 Customer Visit
14:42 15:49 Pierce
Beach
file:///C:/Users/swati/Downloads/Untitled.html 2/11
11/21/24, 4:44 PM Untitled
In [21]: dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 START_DATE 421 non-null datetime64[ns]
1 END_DATE 420 non-null datetime64[ns]
2 CATEGORY 1155 non-null object
3 START 1155 non-null object
4 STOP 1155 non-null object
5 MILES 1156 non-null float64
6 PURPOSE 1156 non-null object
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 63.3+ KB
# Build the DatetimeIndex for START_DATE once, then derive both helper columns from it.
start_stamps = pd.DatetimeIndex(dataset['START_DATE'])
dataset['date'] = start_stamps.date  # calendar-date part of START_DATE
dataset['time'] = start_stamps.hour  # hour-of-day part of START_DATE
In [25]: dataset.head()
2016-01-
2016-01-01 Fort Fort 2016-
0 01 Business 5.1 Meal/Entertain
21:11:00 Pierce Pierce 01-01
21:17:00
2016-01-
2016-01-02 Fort Fort 2016-
1 02 Business 5.0 NOT
01:25:00 Pierce Pierce 01-02
01:37:00
2016-01-
2016-01-02 Fort Fort 2016-
2 02 Business 4.8 Errand/Supplies
20:25:00 Pierce Pierce 01-02
20:38:00
2016-01-
2016-01-05 Fort Fort 2016-
3 05 Business 4.7 Meeting
17:31:00 Pierce Pierce 01-05
17:45:00
2016-01- West
2016-01-06 Fort 2016-
4 06 Business Palm 63.7 Customer Visit
14:42:00 Pierce 01-06
15:49:00 Beach
In [29]: dataset.head()
file:///C:/Users/swati/Downloads/Untitled.html 3/11
11/21/24, 4:44 PM Untitled
Out[29]:
START_DATE END_DATE CATEGORY START STOP MILES PURPOSE date t
2016-01-
2016-01-01 Fort Fort 2016-
0 01 Business 5.1 Meal/Entertain
21:11:00 Pierce Pierce 01-01
21:17:00
2016-01-
2016-01-02 Fort Fort 2016-
1 02 Business 5.0 NOT
01:25:00 Pierce Pierce 01-02
01:37:00
2016-01-
2016-01-02 Fort Fort 2016-
2 02 Business 4.8 Errand/Supplies
20:25:00 Pierce Pierce 01-02
20:38:00
2016-01-
2016-01-05 Fort Fort 2016-
3 05 Business 4.7 Meeting
17:31:00 Pierce Pierce 01-05
17:45:00
2016-01- West
2016-01-06 Fort 2016-
4 06 Business Palm 63.7 Customer Visit
14:42:00 Pierce 01-06
15:49:00 Beach
In [35]: dataset.shape
Data Visualization
In [46]: plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
sns.countplot(dataset['CATEGORY'])
plt.xticks(rotation =90)
plt.subplot(1,2,2)
sns.countplot(dataset['PURPOSE'])
In [48]: sns.countplot(dataset['day-night'])
file:///C:/Users/swati/Downloads/Untitled.html 4/11
11/21/24, 4:44 PM Untitled
In [50]: dataset.head()
Out[50]:
START_DATE END_DATE CATEGORY START STOP MILES PURPOSE date t
2016-01-
2016-01-01 Fort Fort 2016-
0 01 Business 5.1 Meal/Entertain
21:11:00 Pierce Pierce 01-01
21:17:00
2016-01-
2016-01-02 Fort Fort 2016-
1 02 Business 5.0 NOT
01:25:00 Pierce Pierce 01-02
01:37:00
2016-01-
2016-01-02 Fort Fort 2016-
2 02 Business 4.8 Errand/Supplies
20:25:00 Pierce Pierce 01-02
20:38:00
2016-01-
2016-01-05 Fort Fort 2016-
3 05 Business 4.7 Meeting
17:31:00 Pierce Pierce 01-05
17:45:00
2016-01- West
2016-01-06 Fort 2016-
4 06 Business Palm 63.7 Customer Visit
14:42:00 Pierce 01-06
15:49:00 Beach
file:///C:/Users/swati/Downloads/Untitled.html 5/11
11/21/24, 4:44 PM Untitled
In [54]: dataset.head()
Out[54]:
START_DATE END_DATE CATEGORY START STOP MILES PURPOSE date t
2016-01-
2016-01-01 Fort Fort 2016-
0 01 Business 5.1 Meal/Entertain
21:11:00 Pierce Pierce 01-01
21:17:00
2016-01-
2016-01-02 Fort Fort 2016-
1 02 Business 5.0 NOT
01:25:00 Pierce Pierce 01-02
01:37:00
2016-01-
2016-01-02 Fort Fort 2016-
2 02 Business 4.8 Errand/Supplies
20:25:00 Pierce Pierce 01-02
20:38:00
2016-01-
2016-01-05 Fort Fort 2016-
3 05 Business 4.7 Meeting
17:31:00 Pierce Pierce 01-05
17:45:00
2016-01- West
2016-01-06 Fort 2016-
4 06 Business Palm 63.7 Customer Visit
14:42:00 Pierce 01-06
15:49:00 Beach
In [58]: df = pd.DataFrame({
# Two-column summary frame with one row per MONTH group.
# NOTE(review): `mon` is defined outside this view; `mon.values` is placed
# positionally next to the groupby result, so both are assumed to list the months
# in the same order — confirm.
"MONTHS": mon.values, # Total count for each month (per the original note; verify against `mon`).
"VALUE COUNT": dataset.groupby('MONTH', sort=False)['MILES'].max() # Maximum MILES within each month, despite the column label.
})
file:///C:/Users/swati/Downloads/Untitled.html 6/11
11/21/24, 4:44 PM Untitled
In [60]: dataset.head()
Out[60]:
START_DATE END_DATE CATEGORY START STOP MILES PURPOSE date t
2016-01-
2016-01-01 Fort Fort 2016-
0 01 Business 5.1 Meal/Entertain
21:11:00 Pierce Pierce 01-01
21:17:00
2016-01-
2016-01-02 Fort Fort 2016-
1 02 Business 5.0 NOT
01:25:00 Pierce Pierce 01-02
01:37:00
2016-01-
2016-01-02 Fort Fort 2016-
2 02 Business 4.8 Errand/Supplies
20:25:00 Pierce Pierce 01-02
20:38:00
2016-01-
2016-01-05 Fort Fort 2016-
3 05 Business 4.7 Meeting
17:31:00 Pierce Pierce 01-05
17:45:00
2016-01- West
2016-01-06 Fort 2016-
4 06 Business Palm 63.7 Customer Visit
14:42:00 Pierce 01-06
15:49:00 Beach
# Lookup from integer code 0..6 to a short weekday label (0 -> 'Mon', ..., 6 -> 'Sun').
day_label = dict(enumerate(['Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']))
# Replace the codes in DAY with their labels; values missing from the lookup become NaN.
# NOTE(review): presumably DAY holds weekday numbers at this point — confirm upstream.
dataset['DAY'] = dataset['DAY'].map(day_label)
file:///C:/Users/swati/Downloads/Untitled.html 7/11
11/21/24, 4:44 PM Untitled
In [66]: dataset.head()
Out[66]:
START_DATE END_DATE CATEGORY START STOP MILES PURPOSE date t
2016-01-
2016-01-01 Fort Fort 2016-
0 01 Business 5.1 Meal/Entertain
21:11:00 Pierce Pierce 01-01
21:17:00
2016-01-
2016-01-02 Fort Fort 2016-
1 02 Business 5.0 NOT
01:25:00 Pierce Pierce 01-02
01:37:00
2016-01-
2016-01-02 Fort Fort 2016-
2 02 Business 4.8 Errand/Supplies
20:25:00 Pierce Pierce 01-02
20:38:00
2016-01-
2016-01-05 Fort Fort 2016-
3 05 Business 4.7 Meeting
17:31:00 Pierce Pierce 01-05
17:45:00
2016-01- West
2016-01-06 Fort 2016-
4 06 Business Palm 63.7 Customer Visit
14:42:00 Pierce 01-06
15:49:00 Beach
# Bar chart of `day_label`: its index supplies the x categories and its values the
# bar heights, so `day_label` must expose `.index` at this point.
# NOTE(review): elsewhere in this notebook `day_label` is a plain dict, which has no
# `.index`; presumably it is rebound to a Series in a line not shown here — confirm,
# and consider giving that Series its own name.
sns.barplot(x=day_label.index, y= day_label)
plt.xlabel('DAY')
plt.ylabel('COUNT')
file:///C:/Users/swati/Downloads/Untitled.html 8/11
11/21/24, 4:44 PM Untitled
In [70]: dataset.head()
Out[70]:
START_DATE END_DATE CATEGORY START STOP MILES PURPOSE date t
2016-01-
2016-01-01 Fort Fort 2016-
0 01 Business 5.1 Meal/Entertain
21:11:00 Pierce Pierce 01-01
21:17:00
2016-01-
2016-01-02 Fort Fort 2016-
1 02 Business 5.0 NOT
01:25:00 Pierce Pierce 01-02
01:37:00
2016-01-
2016-01-02 Fort Fort 2016-
2 02 Business 4.8 Errand/Supplies
20:25:00 Pierce Pierce 01-02
20:38:00
2016-01-
2016-01-05 Fort Fort 2016-
3 05 Business 4.7 Meeting
17:31:00 Pierce Pierce 01-05
17:45:00
2016-01- West
2016-01-06 Fort 2016-
4 06 Business Palm 63.7 Customer Visit
14:42:00 Pierce 01-06
15:49:00 Beach
In [74]: sns.boxplot(dataset['MILES'])
In [78]: sns.boxplot(dataset[dataset['MILES']<100]['MILES'])
file:///C:/Users/swati/Downloads/Untitled.html 9/11
11/21/24, 4:44 PM Untitled
In [82]: sns.boxplot(dataset[dataset['MILES']<40]['MILES'])
In [86]: sns.distplot(dataset[dataset['MILES']<40]['MILES'])
file:///C:/Users/swati/Downloads/Untitled.html 10/11
11/21/24, 4:44 PM Untitled
C:\Users\swati\AppData\Local\Temp\ipykernel_31136\1678554178.py:1: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
sns.distplot(dataset[dataset['MILES']<40]['MILES'])
Out[86]: <Axes: xlabel='MILES', ylabel='Density'>
In [ ]:
file:///C:/Users/swati/Downloads/Untitled.html 11/11