Cleaning Data
Cleaning Data
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Machine-specific absolute path to the raw sales export. It lives in a single\n",
"# constant so it can be repointed without touching the loading code below.\n",
"SALES_CSV_PATH = 'C:\\\\Users\\\\juanf\\\\OneDrive\\\\Escritorio\\\\sales.csv'\n",
"\n",
"# Load the raw data and preview the first three rows.\n",
"sales = pd.read_csv(SALES_CSV_PATH)\n",
"sales.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SalesOrderID int64\n",
"Revenue object\n",
"Quantity int64\n",
"dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the dtype pandas assigned to each column of `sales`.\n",
"sales.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2 entries, 0 to 1\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 SalesOrderID 2 non-null int64 \n",
" 1 Revenue 2 non-null object\n",
" 2 Quantity 2 non-null int64 \n",
"dtypes: int64(2), object(1)\n",
"memory usage: 180.0+ bytes\n"
]
}
],
"source": [
"# Print the index range, per-column non-null counts, dtypes and memory usage.\n",
"sales.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'23153$1457$'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Revenue is still an object (string) column at this point, so sum() joins the\n",
"# strings together instead of adding numbers.\n",
"sales['Revenue'].sum()\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Remove the '$' marker and cast Revenue to an integer column.\n",
"# Converting to str first keeps this cell re-runnable: once Revenue is already\n",
"# numeric, calling the .str accessor on it directly would raise.\n",
"sales['Revenue'] = (\n",
"    sales['Revenue']\n",
"    .astype(str)\n",
"    .str.strip('$')\n",
"    .astype('int')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Revenue must be an integer column after cleaning. is_integer_dtype accepts any\n",
"# integer width rather than only the one the string 'int' resolves to, and the\n",
"# message reports the dtype actually found if the check fails.\n",
"assert pd.api.types.is_integer_dtype(sales['Revenue']), (\n",
"    f\"Revenue should be an integer dtype, got {sales['Revenue'].dtype}\"\n",
")\n",
"# Trivial always-true assertion kept from the original cell.\n",
"assert 1+1==2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SalesOrderID</th>\n",
" <th>Revenue</th>\n",
" <th>Quantity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>43659.500000</td>\n",
" <td>12305.000000</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.707107</td>\n",
" <td>15341.388725</td>\n",
" <td>7.071068</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>43659.000000</td>\n",
" <td>1457.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>43659.250000</td>\n",
" <td>6881.000000</td>\n",
" <td>4.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>43659.500000</td>\n",
" <td>12305.000000</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>43659.750000</td>\n",
" <td>17729.000000</td>\n",
" <td>9.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>43660.000000</td>\n",
" <td>23153.000000</td>\n",
" <td>12.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SalesOrderID Revenue Quantity\n",
"count 2.000000 2.000000 2.000000\n",
"mean 43659.500000 12305.000000 7.000000\n",
"std 0.707107 15341.388725 7.071068\n",
"min 43659.000000 1457.000000 2.000000\n",
"25% 43659.250000 6881.000000 4.500000\n",
"50% 43659.500000 12305.000000 7.000000\n",
"75% 43659.750000 17729.000000 9.500000\n",
"max 43660.000000 23153.000000 12.000000"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.\n",
"sales.describe()\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'plt' is not defined",
"output_type": "error",
"traceback": [
"\
u001b[1;31m------------------------------------------------------------------------
---\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback
(most recent call last)",
"Cell \u001b[1;32mIn[17], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \
u001b[43mplt\u001b[49m\u001b[38;5;241m.\u001b[39mhist(movies[\u001b[38;5;124m'\
u001b[39m\u001b[38;5;124mavg_rating\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\
u001b[0;32m 2\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(\u001b[38;5;124m'\
u001b[39m\u001b[38;5;124mwhatever\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
"\u001b[1;31mNameError\u001b[0m: name 'plt' is not defined"
]
}
],
"source": [
"# NOTE(review): `plt`, `dt`, `movies` and `user_signups` are not defined in any of\n",
"# the cells visible here; they must be imported/loaded before this cell can run.\n",
"plt.hist(movies['avg_rating'])\n",
"plt.title('whatever')\n",
"\n",
"# Fragment kept from the original notes. On its own it is a syntax error, so it is\n",
"# preserved as a comment only:  > dt.date.today()\n",
"\n",
"# Option 1: drop every row whose rating is above 5, then verify the upper bound.\n",
"movies = movies.drop(movies[movies['avg_rating'] > 5].index)\n",
"assert movies['avg_rating'].max() <= 5\n",
"\n",
"# Option 2: cap ratings above 5 at 5 instead of dropping the rows. After the drop\n",
"# above no row matches the condition, so this only has an effect when it is used\n",
"# in place of Option 1.\n",
"movies.loc[movies['avg_rating'] > 5, 'avg_rating'] = 5\n",
"\n",
"# Parse the subscription dates and keep only the calendar-date part.\n",
"user_signups['subscription_date'] = pd.to_datetime(user_signups['subscription_date']).dt.date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Uniqueness constraints (chapter)\n",
"# NOTE(review): `height_weight` is not created in any of the cells visible here;\n",
"# it must be loaded before this cell can run.\n",
"\n",
"# Flag rows that are complete copies of an earlier row.\n",
"duplicates = height_weight.duplicated()\n",
"height_weight[duplicates]\n",
"\n",
"# Flag rows that share the identifying columns. keep=False marks every member of\n",
"# a duplicate group, not only the repeats.\n",
"column_names = ['first_name','last_name','address']\n",
"duplicates = height_weight.duplicated(subset = column_names, keep = False)\n",
"height_weight[duplicates].sort_values(by = 'first_name')\n",
"\n",
"# Remove complete duplicates, rebinding the name instead of mutating in place.\n",
"height_weight = height_weight.drop_duplicates()\n",
"\n",
"# Collapse the remaining rows that share the identifying columns into one row\n",
"# each, taking the max height and the mean weight.\n",
"summaries = {'height':'max','weight':'mean'}\n",
"height_weight = height_weight.groupby(by = column_names).agg(summaries).reset_index()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Origin",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8ab9798c9dbdc377a44780d03c1df422753fd5bdb3f454154c68fcd412ec471b"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}