Data Visualization EDA-print
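# Setup (assumed; the imports and data load are not shown in the original extract).
# The education.csv path below follows the paths used elsewhere in this script.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
education = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/education.csv")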
# Checking the shape of the education data (number of rows and columns)
education.shape
# Creating a histogram of GMAT scores with specified bins, color, and edge color
plt.hist(education.gmat, bins=[600, 680, 710, 740, 780], color='green', edgecolor="red")
# Creating a histogram of work experience with specified color, edge color, and number of bins
plt.hist(education.workex, color='red', edgecolor="black", bins=6)
# Creating a density plot of GMAT scores using Seaborn with specified bandwidth and filling the area under the curve
sns.kdeplot(education.gmat, bw_method=0.5, fill=True)  # older seaborn versions use bw=0.5
# Descriptive Statistics
# describe function will return descriptive statistics including the
# central tendency, dispersion and shape of a dataset's distribution.
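# Descriptive statistics of the education data
education.describe()
# The scatter plots below use a 'cars' dataset with HP, MPG, SP, VOL and WT columns.
# Its file name and path are assumptions; they are not shown in the original extract.
cars = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/Cars.csv")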
# Creating a scatter plot of horsepower (HP) vs. miles per gallon (MPG)
plt.scatter(x=cars['HP'], y=cars['MPG'])
# Creating a scatter plot of horsepower (HP) vs. sale price (SP) with green color
plt.scatter(x=cars['HP'], y=cars['SP'], color='green')
# Reading data from the CSV file "ethnic diversity.csv"
data = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
'''
EmpID is an integer - Python automatically identifies data types by interpreting the values.
As the data for EmpID is numeric, Python detects the values as int64.
From a measurement-levels perspective, EmpID is nominal data, as it is an identity for each employee.
If we have to alter the data type inferred by Python, we can use the astype() function.
'''
# Converting the 'Zip' column from its current type to 'str' (string) type
data.Zip = data.Zip.astype('str')
##############################################
### Identify duplicate records in the data ###
# Importing the pandas library for data manipulation and analysis
import pandas as pd
# Finding duplicate rows in the DataFrame and storing the result in a Boolean Series
duplicate = data.duplicated() # Returns Boolean Series denoting duplicate rows.
# Finding duplicate rows in the DataFrame and keeping the last occurrence of each duplicated row
duplicate = data.duplicated(keep='last')
duplicate
# Removing duplicate rows from the DataFrame and storing the result in a new DataFrame
data1 = data.drop_duplicates() # Returns DataFrame with duplicate rows removed.
# Removing duplicate rows from the DataFrame and keeping the last occurrence of each duplicated row
data1 = data.drop_duplicates(keep='last')
# Correlation coefficient
'''
Ranges from -1 to +1.
Rule of thumb says |r| > 0.85 is a strong relation
'''
# Calculating the correlation matrix for the columns in the DataFrame
cars.corr()
'''
We can observe that the correlation value for HP and SP is 0.973 and for VOL and WT is 0.999,
hence we can ignore one of the variables in each of these pairs.
'''
################################################
############## Outlier Treatment ###############
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
# Reading data from the CSV file "ethnic diversity.csv"
df = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
# Creating a box plot to visualize the distribution and potential outliers in the 'Salaries' column
sns.boxplot(df.Salaries)
# Creating a box plot to visualize the distribution and potential outliers in the 'age' column
sns.boxplot(df.age)
# No outliers in age column
# Detection of outliers in the 'Salaries' column using the Interquartile Range (IQR) method
IQR = df['Salaries'].quantile(0.75) - df['Salaries'].quantile(0.25)
# Calculating the lower and upper limits for outlier detection based on IQR
lower_limit = df['Salaries'].quantile(0.25) - (IQR * 1.5)
upper_limit = df['Salaries'].quantile(0.75) + (IQR * 1.5)
# Creating a Boolean array indicating whether each value in the 'Salaries' column is an outlier
outliers_df = np.where(df.Salaries > upper_limit, True,
                       np.where(df.Salaries < lower_limit, True, False))
# Filtering the DataFrame to include only rows where 'Salaries' column contains outliers
df_out = df.loc[outliers_df, ]
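# Trimming (assumed step, implied by df_trimmed below): drop the rows flagged as outliers
df_trimmed = df.loc[~outliers_df, ]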
# Displaying the shape of the original DataFrame and the trimmed DataFrame
df.shape, df_trimmed.shape
# Creating a box plot to visualize the distribution of 'Salaries' in the trimmed dataset
sns.boxplot(df_trimmed.Salaries)
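# Winsorizer setup (assumed; the original extract only shows the fit_transform calls).
# A typical configuration using feature_engine's Winsorizer with the IQR method:
from feature_engine.outliers import Winsorizer
winsor_iqr = Winsorizer(capping_method='iqr',   # cap at Q1 - 1.5*IQR and Q3 + 1.5*IQR
                        tail='both',
                        fold=1.5,
                        variables=['Salaries'])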
# Fitting the Winsorizer model to the 'Salaries' column and transforming the data
df_s = winsor_iqr.fit_transform(df[['Salaries']])
# Creating a box plot to visualize the distribution of 'Salaries' after applying Winsorizer with the IQR method
sns.boxplot(df_s.Salaries)
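# Gaussian capping (assumed setup): cap at mean +/- 3 standard deviations
winsor_gaussian = Winsorizer(capping_method='gaussian',
                             tail='both',
                             fold=3,
                             variables=['Salaries'])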
# Fitting the Winsorizer model to the 'Salaries' column and transforming the data
df_t = winsor_gaussian.fit_transform(df[['Salaries']])
# Creating a box plot to visualize the distribution of 'Salaries' after applying Winsorizer with the Gaussian method
sns.boxplot(df_t.Salaries)
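# Percentile capping (assumed setup): cap at the 5th and 95th percentiles
winsor_percentile = Winsorizer(capping_method='quantiles',
                               tail='both',
                               fold=0.05,
                               variables=['Salaries'])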
# Fitting the Winsorizer model to the 'Salaries' column and transforming the data
df_p = winsor_percentile.fit_transform(df[['Salaries']])
# Creating a box plot to visualize the distribution of 'Salaries' after applying Winsorizer with the quantiles method
sns.boxplot(df_p.Salaries)
##############################################
#### zero variance and near zero variance ####
# Reading data from the CSV file "ethnic diversity.csv"
df = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
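# Selecting the numeric columns (assumed step, implied by the variance checks below)
numeric_columns = df.select_dtypes(include=np.number)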
# Checking if the variance of each numeric variable is equal to 0 and returning a Boolean Series
numeric_columns.var() == 0
# Checking if the variance of each numeric variable along axis 0 (columns) is equal to 0 and returning a Boolean Series
numeric_columns.var(axis=0) == 0
#############
# Discretization
# Reading data from the CSV file "ethnic diversity.csv"
data = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
# Displaying information about the DataFrame, including the data types of each column and memory usage
data.info()
# Binarizing the 'Salaries' column into two categories ('Low' and 'High') based on custom bins
data['Salaries_new'] = pd.cut(data['Salaries'],
                              bins=[min(data.Salaries), data.Salaries.mean(), max(data.Salaries)],
                              labels=["Low", "High"])
# Binarizing the 'Salaries' column into two categories ('Low' and 'High') based on custom bins
# Parameters:
# - bins: Custom bins defined by the minimum, mean, and maximum salary values
# - include_lowest: Whether to include the lowest edge of the bins in the intervals
# - labels: Labels assigned to the resulting categories
data['Salaries_new1'] = pd.cut(data['Salaries'],
                               bins=[min(data.Salaries), data.Salaries.mean(), max(data.Salaries)],
                               include_lowest=True,
                               labels=["Low", "High"])
#########
# Importing the matplotlib library for creating plots
import matplotlib.pyplot as plt
##################################################
################## Dummy Variables ###############
# methods:
# get_dummies
# One Hot Encoding
# Label Encoding
# Ordinal Encoding
# Importing the pandas library for data manipulation and analysis
import pandas as pd
# Importing the numpy library for numerical computing
import numpy as np
# Reading data from the CSV file "ethnic diversity.csv"
df = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
# Displaying concise summary of the DataFrame including non-null counts and data types
df.info()
# Dropping the columns 'Employee_Name', 'EmpID', 'Zip' from the DataFrame and storing the result in a new DataFrame
df1 = df.drop(['Employee_Name', 'EmpID', 'Zip'], axis=1)
# Dropping the columns 'Employee_Name', 'EmpID', 'Zip' from the DataFrame in place (modifying the original DataFrame)
df.drop(['Employee_Name', 'EmpID', 'Zip'], axis=1, inplace=True)
# Creating dummy variables for categorical columns in the DataFrame and storing the result in a new DataFrame
# pd.get_dummies() returns True/False values by default; applying .astype('int64') to its output
# converts these Boolean values to integers, resulting in 1s and 0s.
df_new = pd.get_dummies(df).astype('int64')
# Creating dummy variables for categorical columns in the DataFrame and dropping the first category of each column
df_new_1 = pd.get_dummies(df, drop_first=True).astype('int64')
# Selecting specific columns and updating the DataFrame with the selected columns
df = df[['Salaries', 'age', 'Position', 'State', 'Sex',
'MaritalDesc', 'CitizenDesc', 'EmploymentStatus', 'Department', 'Race']]
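# One Hot Encoder setup (assumed; only the fit_transform call appears in the original extract).
# Recent scikit-learn versions use sparse_output=False; older versions use sparse=False.
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse_output=False)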
# Transforming the categorical columns (from the 'Position' column onwards) into one-hot encoded format and converting to a DataFrame
enc_df = pd.DataFrame(enc.fit_transform(df.iloc[:, 2:]),
                      columns=enc.get_feature_names_out(input_features=df.iloc[:, 2:].columns))
#######################
# Label Encoder
# Label Encoding is typically applied to a single column or feature at a time, meaning it operates on one-dimensional data.
# Importing the LabelEncoder class from the sklearn.preprocessing module
from sklearn.preprocessing import LabelEncoder
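# A minimal usage sketch (assumed, not in the original extract): encode one column at a time
le = LabelEncoder()
sex_encoded = le.fit_transform(df['Sex'])  # array of integer codes, one per row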
########################
# Ordinal Encoding
# Importing the OrdinalEncoder class from the sklearn.preprocessing module
from sklearn.preprocessing import OrdinalEncoder
# Ordinal Encoding can handle multiple dimensions or features simultaneously.
oe = OrdinalEncoder()
# Data Split into Input and Output variables
# X contains the features (independent variables), excluding the last column
X = df.iloc[:, :9]
# y contains the target variable (dependent variable), which is the last column
y = df.iloc[:, 9]
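# A minimal usage sketch (assumed): ordinal-encode the categorical feature columns of X
X_cat = X.iloc[:, 2:]   # columns from 'Position' onwards are categorical
X_enc = pd.DataFrame(oe.fit_transform(X_cat), columns=X_cat.columns)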
# Loading the modified ethnic dataset from the CSV file "modified ethnic.csv"
df = pd.read_csv(r'D:/New materials/EDA/InClass_DataPreprocessing_datasets/modified ethnic.csv')  # for doing modifications
# Checking for the count of missing values (NA's) in each column of the DataFrame
df.isna().sum()
# Mean Imputer: Replacing missing values in the 'Salaries' column with the mean value
from sklearn.impute import SimpleImputer
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df["Salaries"] = pd.DataFrame(mean_imputer.fit_transform(df[["Salaries"]]))
df["Salaries"].isna().sum() # Checking for any remaining missing values in 'Salaries'
# Median Imputer: Replacing missing values in the 'age' column with the median value
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df["age"] = pd.DataFrame(median_imputer.fit_transform(df[["age"]]))
df["age"].isna().sum() # Checking for any remaining missing values in 'age'
# Mode Imputer: Replacing missing values in the 'Sex' and 'MaritalDesc' columns with the most frequent value
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df["Sex"] = pd.DataFrame(mode_imputer.fit_transform(df[["Sex"]]))
df["MaritalDesc"] = pd.DataFrame(mode_imputer.fit_transform(df[["MaritalDesc"]]))
df.isnull().sum() # Checking for any remaining missing values
# Constant Value Imputer: Replacing missing values in the 'Sex' column with a constant value 'F'
constant_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='F')
df["Sex"] = pd.DataFrame(constant_imputer.fit_transform(df[["Sex"]]))
# Random Imputer: Replacing missing values in the 'age' column with random samples from the same column
from feature_engine.imputation import RandomSampleImputer
random_imputer = RandomSampleImputer(['age'])
df["age"] = pd.DataFrame(random_imputer.fit_transform(df[["age"]]))
df["age"].isna().sum() # Checking for any remaining missing values in 'age'
#####################
# Normal Quantile-Quantile Plot
import scipy.stats as stats
import pylab
# Checking whether the 'gmat' data is normally distributed using a Q-Q plot
stats.probplot(education.gmat, dist="norm", plot=pylab)
# Checking whether the 'workex' data is normally distributed using a Q-Q plot
stats.probplot(education.workex, dist="norm", plot=pylab)
# Original data
prob = stats.probplot(education.workex, dist=stats.norm, plot=pylab)
# Transforming the 'workex' data using the Box-Cox transformation and saving the lambda value
fitted_data, fitted_lambda = stats.boxcox(education.workex)
# Creating subplots
fig, ax = plt.subplots(1, 2)
# Transformed data
prob = stats.probplot(fitted_data, dist=stats.norm, plot=pylab)
# Yeo-Johnson Transform
'''
We can apply it to our dataset without scaling the data.
It supports zero and negative values; it does not require the values of
each input variable to be strictly positive.
'''
# Original data
# Checking whether the 'workex' data is normally distributed using a Q-Q plot
prob = stats.probplot(education.workex, dist=stats.norm, plot=pylab)
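# Applying the Yeo-Johnson transform: a sketch (assumed), using feature_engine's YeoJohnsonTransformer
from feature_engine import transformation
tf_yj = transformation.YeoJohnsonTransformer(variables=['workex'])
edu_tf = tf_yj.fit_transform(education)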
# Transformed data
# Checking whether the transformed 'workex' data is normally distributed using a Q-Q plot
prob = stats.probplot(edu_tf.workex, dist=stats.norm, plot=pylab)
####################################################
######## Standardization and Normalization #########
# Normalization
''' Alternatively we can use the below function'''
# Importing MinMaxScaler from the sklearn.preprocessing module
from sklearn.preprocessing import MinMaxScaler
### Normalization
# Loading the dataset from the CSV file "ethnic diversity.csv"
ethnic1 = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
# Generating dummy variables for categorical columns in the dataset and dropping the first category of each column
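# A sketch (assumed): create dummy variables, then rescale every column to the [0, 1] range
ethnic1 = pd.get_dummies(ethnic1, drop_first=True)
minmaxscale = MinMaxScaler()
ethnic_norm = pd.DataFrame(minmaxscale.fit_transform(ethnic1), columns=ethnic1.columns)
ethnic_norm.describe()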
'''Robust Scaling
Scale features using statistics that are robust to outliers'''
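# A sketch (assumed): robust scaling with sklearn's RobustScaler (centres on the median, scales by the IQR)
from sklearn.preprocessing import RobustScaler
robust = RobustScaler()
ethnic_robust = pd.DataFrame(robust.fit_transform(ethnic1), columns=ethnic1.columns)
ethnic_robust.describe()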
2 + 2  # Run a selected line with the F9 key; the console works as a calculator
# Read the CSV file into the 'Education' DataFrame using the specified file path
Education = pd.read_csv("C:/Users/education.csv")
# Auto EDA
# ---------
# Sweetviz
# Autoviz
# Dtale
# Pandas Profiling
# Dataprep
# Sweetviz
###########
#pip install sweetviz
# Import the sweetviz library
import sweetviz as sv
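# A minimal usage sketch (assumed): analyse the Education DataFrame and write an HTML report
report = sv.analyze(Education)
report.show_html('sweetviz_report.html')  # file name is illustrative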
# Autoviz
###########
# pip install autoviz
# Import the AutoViz_Class from the autoviz package
from autoviz.AutoViz_Class import AutoViz_Class
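# A minimal usage sketch (assumed): run AutoViz on the in-memory Education DataFrame
AV = AutoViz_Class()
av_report = AV.AutoViz(filename="", dfte=Education)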
# D-Tale
########
# pip install dtale  # In case of any error, install the appropriate werkzeug version (pip install werkzeug==2.0.3)
import dtale
import pandas as pd
# Read the CSV file into a DataFrame
df = pd.read_csv(r"D:/New
materials/EDA/InClass_DataPreprocessing_datasets/education.csv")
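# A minimal usage sketch (assumed): launch the D-Tale web interface for the DataFrame
d = dtale.show(df)
d.open_browser()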
# Pandas Profiling
###################
# pip install ydata-profiling
from ydata_profiling import ProfileReport  # older installs: from pandas_profiling import ProfileReport
p = ProfileReport(df)
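# Save the generated profile report to an HTML file (file name is illustrative)
p.to_file("education_profile_report.html")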
# dataprep
###################
from dataprep.eda import create_report
# Generate an EDA report using dataprep
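# A minimal usage sketch (assumed): build the dataprep report and open it in the browser
report = create_report(df)
report.show_browser()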