0% found this document useful (0 votes)
8 views6 pages

Cleaning Data

This document contains a Jupyter notebook that processes a sales dataset with pandas: it reads a CSV file, checks the column data types, cleans the 'Revenue' column (stripping the '$' sign and casting the values to integers), and computes descriptive statistics. A later cell attempts to plot a histogram but fails with a NameError because the name `plt` is never defined in the notebook shown (no plotting library is imported).

Uploaded by

juanfe86
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
0% found this document useful (0 votes)
8 views6 pages

Cleaning Data

This document contains a Jupyter notebook that processes a sales dataset with pandas: it reads a CSV file, checks the column data types, cleans the 'Revenue' column (stripping the '$' sign and casting the values to integers), and computes descriptive statistics. A later cell attempts to plot a histogram but fails with a NameError because the name `plt` is never defined in the notebook shown (no plotting library is imported).

Uploaded by

juanfe86
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
You are on page 1/ 6

{

"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"sales = pd.read_csv('C:\\\\Users\\\\juanf\\\\OneDrive\\\\Escritorio\\\\sales.csv')\n",
"sales.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SalesOrderID int64\n",
"Revenue object\n",
"Quantity int64\n",
"dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2 entries, 0 to 1\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 SalesOrderID 2 non-null int64 \n",
" 1 Revenue 2 non-null object\n",
" 2 Quantity 2 non-null int64 \n",
"dtypes: int64(2), object(1)\n",
"memory usage: 180.0+ bytes\n"
]
}
],
"source": [
"sales.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'23153$1457$'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales['Revenue'].sum()\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"sales['Revenue']=sales['Revenue'].str.strip('$')\n",
"sales['Revenue']=sales['Revenue'].astype('int')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"assert sales['Revenue'].dtype == 'int'\n",
"assert 1+1==2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SalesOrderID</th>\n",
" <th>Revenue</th>\n",
" <th>Quantity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>43659.500000</td>\n",
" <td>12305.000000</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.707107</td>\n",
" <td>15341.388725</td>\n",
" <td>7.071068</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>43659.000000</td>\n",
" <td>1457.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>43659.250000</td>\n",
" <td>6881.000000</td>\n",
" <td>4.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>43659.500000</td>\n",
" <td>12305.000000</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>43659.750000</td>\n",
" <td>17729.000000</td>\n",
" <td>9.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>43660.000000</td>\n",
" <td>23153.000000</td>\n",
" <td>12.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SalesOrderID Revenue Quantity\n",
"count 2.000000 2.000000 2.000000\n",
"mean 43659.500000 12305.000000 7.000000\n",
"std 0.707107 15341.388725 7.071068\n",
"min 43659.000000 1457.000000 2.000000\n",
"25% 43659.250000 6881.000000 4.500000\n",
"50% 43659.500000 12305.000000 7.000000\n",
"75% 43659.750000 17729.000000 9.500000\n",
"max 43660.000000 23153.000000 12.000000"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales.describe()\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'plt' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[17], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mplt\u001b[49m\u001b[38;5;241m.\u001b[39mhist(movies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mavg_rating\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 2\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwhatever\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
"\u001b[1;31mNameError\u001b[0m: name 'plt' is not defined"
]
}
],
"source": [
"plt.hist(movies['avg_rating'])\n",
"plt.title('whatever')\n",
"> dt.date.today()\n",
"movies.drop(movies[movies['avg_rating']>5].index, inplace = True)\n",
"assert movies['avg_rating'].max()<=5\n",
"movies.loc[movies['avg_rating']>5,'avg_rating']=5\n",
"user_signups['subscription_date'] = pd.to_datetime(user_signups['subscription_date']).dt.date\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Uniqueness constraints (chapter)\n",
"duplicates = height_weight.duplicated()\n",
"height_weight[duplicates]\n",
"\n",
"column_names = ['first_name','last_name','address']\n",
"duplicates = height_weight.duplicated(subset = column_names, keep = False)\n",
"height_weight[duplicates].sort_values(by = 'first_name')\n",
"\n",
"height_weight.drop_duplicates(inplace = True)\n",
"\n",
"column_names = ['first_name','last_name','address']\n",
"summaries = {'height':'max','weight':'mean'}\n",
"height_weight = height_weight.groupby(by = column_names).agg(summaries).reset_index()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Origin",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8ab9798c9dbdc377a44780d03c1df422753fd5bdb3f454154c68fcd412ec471b"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy