0% found this document useful (0 votes)
22 views14 pages

Cognizant's Artificial Intelligence Task 1

Uploaded by

Nikhil
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
22 views14 pages

Cognizant's Artificial Intelligence Task 1

Uploaded by

Nikhil
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 14

import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Section 2 - Data loading
Now that Google Drive is mounted, you can store the CSV file anywhere in your Drive and update
the `path` variable below to access it within this notebook. Once we've updated the path, let's
read this CSV file into a pandas DataFrame and see what it looks like.

# Location of the sample sales CSV inside the mounted Google Drive.
path = "/content/drive/MyDrive/sample_sales_data.csv"

# Read the CSV and discard the "Unnamed: 0" column when it is present;
# errors='ignore' turns the drop into a no-op if the column is missing.
df = pd.read_csv(path).drop(columns=["Unnamed: 0"], errors='ignore')

# Preview the first five rows.
df.head(5)

transaction_id timestamp \
0 a1c82654-c52c-45b3-8ce8-4c2a1efe63ed 2022-03-02 09:51:38
1 931ad550-09e8-4da6-beaa-8c9d17be9c60 2022-03-06 10:33:59
2 ae133534-6f61-4cd6-b6b8-d1c1d8d90aea 2022-03-04 17:20:21
3 157cebd9-aaf0-475d-8a11-7c8e0f5b76e4 2022-03-02 17:23:58
4 a81a6cd3-5e0c-44a2-826c-aea43e46c514 2022-03-05 14:32:43

product_id category customer_type


unit_price \
0 3bc6c1ea-0198-46de-9ffd-514ae3338713 fruit gold
3.99
1 ad81b46c-bf38-41cf-9b54-5fe7f5eba93e fruit standard
3.99
2 7c55cbd4-f306-4c04-a030-628cbe7867c1 fruit premium
0.19
3 80da8348-1707-403f-8be7-9e6deeccc883 fruit gold
0.19
4 7f5e86e6-f06f-45f6-bf44-27b095c9ad1d fruit basic
4.49

quantity total payment_type


0 2 7.98 e-wallet
1 1 3.99 e-wallet
2 2 0.38 e-wallet
3 4 0.76 e-wallet
4 2 8.98 debit card

<google.colab._quickchart_helpers.SectionTitle at 0x79b2b2a47e20>

from matplotlib import pyplot as plt

# One 20-bin histogram per (frame, column) pair; the chart title is the
# column name. After each plot the top/right spines of the current axes
# are hidden.
for _frame, _column in (
    (_df_0, 'index'),
    (_df_1, 'unit_price'),
    (_df_2, 'quantity'),
    (_df_3, 'total'),
):
    _frame[_column].plot(kind='hist', bins=20, title=_column)
    plt.gca().spines[['top', 'right']].set_visible(False)

<google.colab._quickchart_helpers.SectionTitle at 0x79b2b2a47550>

from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart of group sizes for each categorical column, coloured
# with the 'Dark2' palette; top/right spines are hidden after each plot.
for _frame, _key in (
    (_df_4, 'transaction_id'),
    (_df_5, 'timestamp'),
    (_df_6, 'product_id'),
    (_df_7, 'customer_type'),
):
    _group_sizes = _frame.groupby(_key).size()
    _group_sizes.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
    plt.gca().spines[['top', 'right']].set_visible(False)

<google.colab._quickchart_helpers.SectionTitle at 0x79b2b2a47ee0>

from matplotlib import pyplot as plt

# Scatter plot for each (x, y) column pair, with the top/right spines of the
# current axes hidden after every plot.
for _frame, _x_col, _y_col in (
    (_df_8, 'index', 'unit_price'),
    (_df_9, 'unit_price', 'quantity'),
    (_df_10, 'quantity', 'total'),
):
    _frame.plot(kind='scatter', x=_x_col, y=_y_col, s=32, alpha=.8)
    plt.gca().spines[['top', 'right']].set_visible(False)
<google.colab._quickchart_helpers.SectionTitle at 0x79b2b2a8d000>

from matplotlib import pyplot as plt
import seaborn as sns


def _plot_series(series, series_name, series_index=0):
    """Plot one group's ``unit_price`` against ``index`` on the current axes.

    Args:
        series: Rows of a single group; must provide ``index`` and
            ``unit_price`` columns.
        series_name: Label attached to the plotted line (used by the legend).
        series_index: Position of the group; selects the line colour.
    """
    palette = list(sns.palettes.mpl_palette('Dark2'))
    xs = series['index']
    ys = series['unit_price']
    # Wrap around the palette so any number of groups still gets a colour.
    plt.plot(xs, ys, label=series_name,
             color=palette[series_index % len(palette)])


fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
df_sorted = _df_11.sort_values('index', ascending=True)
for i, (series_name, series) in enumerate(df_sorted.groupby('transaction_id')):
    _plot_series(series, series_name, i)
# Build the legend once, after every series has been drawn.
fig.legend(title='transaction_id', bbox_to_anchor=(1, 1), loc='upper left')
sns.despine(fig=fig, ax=ax)
plt.xlabel('index')
_ = plt.ylabel('unit_price')

from matplotlib import pyplot as plt
import seaborn as sns


def _plot_series(series, series_name, series_index=0):
    """Plot one group's ``unit_price`` against ``index`` on the current axes.

    Args:
        series: Rows of a single group; must provide ``index`` and
            ``unit_price`` columns.
        series_name: Label attached to the plotted line (used by the legend).
        series_index: Position of the group; selects the line colour.
    """
    palette = list(sns.palettes.mpl_palette('Dark2'))
    xs = series['index']
    ys = series['unit_price']
    # Wrap around the palette so any number of groups still gets a colour.
    plt.plot(xs, ys, label=series_name,
             color=palette[series_index % len(palette)])


fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
df_sorted = _df_12.sort_values('index', ascending=True)
for i, (series_name, series) in enumerate(df_sorted.groupby('timestamp')):
    _plot_series(series, series_name, i)
# Build the legend once, after every series has been drawn.
fig.legend(title='timestamp', bbox_to_anchor=(1, 1), loc='upper left')
sns.despine(fig=fig, ax=ax)
plt.xlabel('index')
_ = plt.ylabel('unit_price')

from matplotlib import pyplot as plt
import seaborn as sns


def _plot_series(series, series_name, series_index=0):
    """Plot one group's ``unit_price`` against ``index`` on the current axes.

    Args:
        series: Rows of a single group; must provide ``index`` and
            ``unit_price`` columns.
        series_name: Label attached to the plotted line (used by the legend).
        series_index: Position of the group; selects the line colour.
    """
    palette = list(sns.palettes.mpl_palette('Dark2'))
    xs = series['index']
    ys = series['unit_price']
    # Wrap around the palette so any number of groups still gets a colour.
    plt.plot(xs, ys, label=series_name,
             color=palette[series_index % len(palette)])


fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
df_sorted = _df_13.sort_values('index', ascending=True)
for i, (series_name, series) in enumerate(df_sorted.groupby('product_id')):
    _plot_series(series, series_name, i)
# Build the legend once, after every series has been drawn.
fig.legend(title='product_id', bbox_to_anchor=(1, 1), loc='upper left')
sns.despine(fig=fig, ax=ax)
plt.xlabel('index')
_ = plt.ylabel('unit_price')

from matplotlib import pyplot as plt
import seaborn as sns


def _plot_series(series, series_name, series_index=0):
    """Plot one group's ``unit_price`` against ``index`` on the current axes.

    Args:
        series: Rows of a single group; must provide ``index`` and
            ``unit_price`` columns.
        series_name: Label attached to the plotted line (used by the legend).
        series_index: Position of the group; selects the line colour.
    """
    palette = list(sns.palettes.mpl_palette('Dark2'))
    xs = series['index']
    ys = series['unit_price']
    # Wrap around the palette so any number of groups still gets a colour.
    plt.plot(xs, ys, label=series_name,
             color=palette[series_index % len(palette)])


fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
df_sorted = _df_14.sort_values('index', ascending=True)
for i, (series_name, series) in enumerate(df_sorted.groupby('customer_type')):
    _plot_series(series, series_name, i)
# Build the legend once, after every series has been drawn.
fig.legend(title='customer_type', bbox_to_anchor=(1, 1), loc='upper left')
sns.despine(fig=fig, ax=ax)
plt.xlabel('index')
_ = plt.ylabel('unit_price')

<google.colab._quickchart_helpers.SectionTitle at 0x79b2b2a8d060>

from matplotlib import pyplot as plt

# Line plot of each numeric column (8x4 inch figure, titled with the column
# name); top/right spines of the current axes are hidden after each plot.
for _frame, _column in (
    (_df_15, 'index'),
    (_df_16, 'unit_price'),
    (_df_17, 'quantity'),
    (_df_18, 'total'),
):
    _frame[_column].plot(kind='line', figsize=(8, 4), title=_column)
    plt.gca().spines[['top', 'right']].set_visible(False)

<google.colab._quickchart_helpers.SectionTitle at 0x79b2b2a47df0>

from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

# For each pair of categorical columns, build a table whose columns are the
# groups of the x column and whose rows are value counts of the y column
# within each group, then render it as a viridis heatmap on a new 8x8 figure.
for _frame, _x_col, _y_col in (
    (_df_19, 'transaction_id', 'timestamp'),
    (_df_20, 'timestamp', 'product_id'),
    (_df_21, 'product_id', 'customer_type'),
    (_df_22, 'customer_type', 'payment_type'),
):
    plt.subplots(figsize=(8, 8))
    df_2dhist = pd.DataFrame({
        x_label: grp[_y_col].value_counts()
        for x_label, grp in _frame.groupby(_x_col)
    })
    sns.heatmap(df_2dhist, cmap='viridis')
    plt.xlabel(_x_col)
    _ = plt.ylabel(_y_col)

<google.colab._quickchart_helpers.SectionTitle at 0x79b2b2a8d510>

from matplotlib import pyplot as plt
import seaborn as sns

# Violin plot of 'index' per category value. The figure height scales with
# the number of distinct values (1.2 inches each); all spines are removed.
for _frame, _group_col in (
    (_df_23, 'transaction_id'),
    (_df_24, 'timestamp'),
    (_df_25, 'product_id'),
    (_df_26, 'customer_type'),
):
    figsize = (12, 1.2 * len(_frame[_group_col].unique()))
    plt.figure(figsize=figsize)
    sns.violinplot(_frame, x='index', y=_group_col, inner='stick',
                   palette='Dark2')
    sns.despine(top=True, right=True, bottom=True, left=True)

from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar charts of group sizes, first by customer type and then by
# transaction id, using the 'Dark2' palette; top/right spines are hidden.
for _frame, _key in (
    (_df_7, 'customer_type'),
    (_df_4, 'transaction_id'),
):
    _group_sizes = _frame.groupby(_key).size()
    _group_sizes.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
    plt.gca().spines[['top', 'right']].set_visible(False)
# Section 3 - Descriptive statistics
In this section, you should try to gain a description of the data, that is: what columns are
present, how many null values exist, and what data types exist within each column.

To get you started, an explanation of what the column names mean is provided below:

- transaction_id = this is a unique ID that is assigned to each transaction
- timestamp = this is the datetime at which the transaction was made
- product_id = this is an ID that is assigned to the product that was sold. Each product has a unique ID
- category = this is the category that the product is contained within
- customer_type = this is the type of customer that made the transaction
- unit_price = the price that 1 unit of this item sells for
- quantity = the number of units sold for this product within this transaction
- total = the total amount payable by the customer
- payment_type = the payment method used by the customer

After this, you should try to compute some descriptive statistics of the numerical columns
within the dataset, such as:

- mean
- median
- count

#Statistical properties of the dataset

Data types:

# Show the dtype of every column in the loaded frame.
df.dtypes

transaction_id object
timestamp object
product_id object
category object
customer_type object
unit_price float64
quantity int64
total float64
payment_type object
dtype: object

#Missing values:

# Count the missing (NaN) entries in each column.
df.isna().sum()

transaction_id 0
timestamp 0
product_id 0
category 0
customer_type 0
unit_price 0
quantity 0
total 0
payment_type 0
dtype: int64

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
unit_price quantity total
count 7829.000000 7829.000000 7829.000000
mean 7.819480 2.501597 19.709905
std 5.388088 1.122722 17.446680
min 0.190000 1.000000 0.190000
25% 3.990000 1.000000 6.570000
50% 7.190000 3.000000 14.970000
75% 11.190000 4.000000 28.470000
max 23.990000 4.000000 95.960000

#Data visualization

Category distribution

import seaborn as sns

# Number of rows per product, split by the product's category.
sns.countplot(x='product_id', hue='category', data=df)
plt.title('Product ID Distribution by Category')
plt.xlabel('Product ID')
plt.ylabel('Count')
plt.show()

# One histogram per column of interest, each on its own 10x6 figure.
for _column, _label in (
    ('product_id', 'Product ID'),
    ('quantity', 'Quantity'),
    ('unit_price', 'Unit Price'),
    ('total', 'Total Price'),
):
    plt.figure(figsize=(10, 6))
    sns.histplot(df[_column])
    plt.title(f'{_label} Distribution')
    plt.xlabel(_label)
    plt.ylabel('Count')
    plt.show()

# Correlation heatmap. The frame also holds non-numeric (object) columns, so
# restrict corr() to numeric columns explicitly: relying on the implicit
# default is deprecated and fails on newer pandas releases.
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.title('Correlation Matrix')
plt.show()

<ipython-input-14-4142aee218c4>:2: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
sns.heatmap(df.corr(), annot=True)
# Summary
We have completed an initial exploratory data analysis on the sample of data
provided. We should now have a solid understanding of the data.

The client wants to know:

"How to better stock the items that they sell"

From this dataset, it is impossible to answer that question. In order to make the next step
on this project with the client, it is clear that:

- We need more rows of data. The current sample is only from 1 store and 1 week's worth of data.
- We need to frame the specific problem statement that we want to solve. The current business
  problem is too broad; we should narrow down the focus in order to deliver a valuable end
  product.
- We need more features. Based on the problem statement that we move forward with,
  we need more columns (features) that may help us to understand the outcome that we're
  solving for.

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy