Data Visualization EDA-print
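# Setup (assumed; the imports and data load are not shown in the original extract).
# The education.csv path below follows the paths used elsewhere in this script.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
education = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/education.csv")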
# Checking the shape of the education data (number of rows and columns)
education.shape
# Creating a histogram of GMAT scores with specified bins, color, and edge color
plt.hist(education.gmat, bins=[600, 680, 710, 740, 780], color='green', edgecolor="red")
# Creating a histogram of work experience with specified color, edge color, and number of bins
plt.hist(education.workex, color='red', edgecolor="black", bins=6)
# Creating a density plot of GMAT scores using Seaborn with specified bandwidth and filling the area under the curve
sns.kdeplot(education.gmat, bw_method=0.5, fill=True)  # older seaborn versions use bw=0.5
# Descriptive Statistics
# describe function will return descriptive statistics including the
# central tendency, dispersion and shape of a dataset's distribution.
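# Descriptive statistics of the education data
education.describe()
# The scatter plots below use a 'cars' dataset with HP, MPG, SP, VOL and WT columns.
# Its file name and path are assumptions; they are not shown in the original extract.
cars = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/Cars.csv")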
# Creating a scatter plot of horsepower (HP) vs. miles per gallon (MPG)
plt.scatter(x=cars['HP'], y=cars['MPG'])
# Creating a scatter plot of horsepower (HP) vs. sale price (SP) with green color
plt.scatter(x=cars['HP'], y=cars['SP'], color='green')
# Reading data from the CSV file "ethnic diversity.csv"
data = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
'''
EmpID is an integer - Python automatically identifies data types by interpreting the values.
As the data for EmpID is numeric, Python detects the values as int64.
From a measurement-levels perspective, EmpID is nominal data, as it is an identity for each employee.
If we have to alter the data type inferred by Python, we can use the astype() function.
'''
# Converting the 'Zip' column from its current type to 'str' (string) type
data.Zip = data.Zip.astype('str')
##############################################
### Identify duplicate records in the data ###
# Importing the pandas library for data manipulation and analysis
import pandas as pd
# Finding duplicate rows in the DataFrame and storing the result in a Boolean Series
duplicate = data.duplicated() # Returns Boolean Series denoting duplicate rows.
# Finding duplicate rows in the DataFrame and keeping the last occurrence of each duplicated row
duplicate = data.duplicated(keep='last')
duplicate
# Removing duplicate rows from the DataFrame and storing the result in a new DataFrame
data1 = data.drop_duplicates() # Returns DataFrame with duplicate rows removed.
# Removing duplicate rows from the DataFrame and keeping the last occurrence of each duplicated row
data1 = data.drop_duplicates(keep='last')
# Correlation coefficient
'''
Ranges from -1 to +1.
Rule of thumb says |r| > 0.85 is a strong relation
'''
# Calculating the correlation matrix for the columns in the DataFrame
cars.corr()
'''
We can observe that the correlation value for HP and SP is 0.973 and for VOL and WT is 0.999,
hence we can ignore one of the variables in each of these pairs.
'''
################################################
############## Outlier Treatment ###############
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
# Reading data from the CSV file "ethnic diversity.csv"
df = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
# Creating a box plot to visualize the distribution and potential outliers in the 'Salaries' column
sns.boxplot(df.Salaries)
# Creating a box plot to visualize the distribution and potential outliers in the 'age' column
sns.boxplot(df.age)
# No outliers in age column
# Detection of outliers in the 'Salaries' column using the Interquartile Range (IQR) method
IQR = df['Salaries'].quantile(0.75) - df['Salaries'].quantile(0.25)
# Calculating the lower and upper limits for outlier detection based on IQR
lower_limit = df['Salaries'].quantile(0.25) - (IQR * 1.5)
upper_limit = df['Salaries'].quantile(0.75) + (IQR * 1.5)
# Creating a Boolean array indicating whether each value in the 'Salaries' column is an outlier
outliers_df = np.where(df.Salaries > upper_limit, True,
                       np.where(df.Salaries < lower_limit, True, False))
# Filtering the DataFrame to include only rows where 'Salaries' column contains outliers
df_out = df.loc[outliers_df, ]
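# Trimming (assumed step, implied by df_trimmed below): drop the rows flagged as outliers
df_trimmed = df.loc[~outliers_df, ]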
# Displaying the shape of the original DataFrame and the trimmed DataFrame
df.shape, df_trimmed.shape
# Creating a box plot to visualize the distribution of 'Salaries' in the trimmed dataset
sns.boxplot(df_trimmed.Salaries)
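# Winsorizer setup (assumed; the original extract only shows the fit_transform calls).
# A typical configuration using feature_engine's Winsorizer with the IQR method:
from feature_engine.outliers import Winsorizer
winsor_iqr = Winsorizer(capping_method='iqr',   # cap at Q1 - 1.5*IQR and Q3 + 1.5*IQR
                        tail='both',
                        fold=1.5,
                        variables=['Salaries'])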
# Fitting the Winsorizer model to the 'Salaries' column and transforming the data
df_s = winsor_iqr.fit_transform(df[['Salaries']])
# Creating a box plot to visualize the distribution of 'Salaries' after applying Winsorizer with the IQR method
sns.boxplot(df_s.Salaries)
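# Gaussian capping (assumed setup): cap at mean +/- 3 standard deviations
winsor_gaussian = Winsorizer(capping_method='gaussian',
                             tail='both',
                             fold=3,
                             variables=['Salaries'])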
# Fitting the Winsorizer model to the 'Salaries' column and transforming the data
df_t = winsor_gaussian.fit_transform(df[['Salaries']])
# Creating a box plot to visualize the distribution of 'Salaries' after applying Winsorizer with the Gaussian method
sns.boxplot(df_t.Salaries)
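# Percentile capping (assumed setup): cap at the 5th and 95th percentiles
winsor_percentile = Winsorizer(capping_method='quantiles',
                               tail='both',
                               fold=0.05,
                               variables=['Salaries'])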
# Fitting the Winsorizer model to the 'Salaries' column and transforming the data
df_p = winsor_percentile.fit_transform(df[['Salaries']])
# Creating a box plot to visualize the distribution of 'Salaries' after applying Winsorizer with the quantiles method
sns.boxplot(df_p.Salaries)
##############################################
#### zero variance and near zero variance ####
# Reading data from the CSV file "ethnic diversity.csv"
df = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
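# Selecting the numeric columns (assumed step, implied by the variance checks below)
numeric_columns = df.select_dtypes(include=np.number)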
# Checking if the variance of each numeric variable is equal to 0 and returning a Boolean Series
numeric_columns.var() == 0
# Checking if the variance of each numeric variable along axis 0 (columns) is equal to 0 and returning a Boolean Series
numeric_columns.var(axis=0) == 0
#############
# Discretization
# Reading data from the CSV file "ethnic diversity.csv"
data = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
# Displaying information about the DataFrame, including the data types of each column and memory usage
data.info()
# Binarizing the 'Salaries' column into two categories ('Low' and 'High') based on custom bins
data['Salaries_new'] = pd.cut(data['Salaries'],
                              bins=[min(data.Salaries), data.Salaries.mean(), max(data.Salaries)],
                              labels=["Low", "High"])
# Binarizing the 'Salaries' column into two categories ('Low' and 'High') based on custom bins
# Parameters:
# - bins: Custom bins defined by the minimum, mean, and maximum salary values
# - include_lowest: Whether to include the lowest edge of the bins in the intervals
# - labels: Labels assigned to the resulting categories
data['Salaries_new1'] = pd.cut(data['Salaries'],
                               bins=[min(data.Salaries), data.Salaries.mean(), max(data.Salaries)],
                               include_lowest=True,
                               labels=["Low", "High"])
#########
# Importing the matplotlib library for creating plots
import matplotlib.pyplot as plt
##################################################
################## Dummy Variables ###############
# methods:
# get_dummies
# One Hot Encoding
# Label Encoding
# Ordinal Encoding
# Importing the pandas library for data manipulation and analysis
import pandas as pd
# Importing the numpy library for numerical computing
import numpy as np
# Reading data from the CSV file "ethnic diversity.csv"
df = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
# Displaying concise summary of the DataFrame including non-null counts and data types
df.info()
# Dropping the columns 'Employee_Name', 'EmpID', 'Zip' from the DataFrame and storing the result in a new DataFrame
df1 = df.drop(['Employee_Name', 'EmpID', 'Zip'], axis=1)
# Dropping the columns 'Employee_Name', 'EmpID', 'Zip' from the DataFrame in place (modifying the original DataFrame)
df.drop(['Employee_Name', 'EmpID', 'Zip'], axis=1, inplace=True)
# Creating dummy variables for categorical columns in the DataFrame and storing the result in a new DataFrame
# pd.get_dummies() returns True/False values by default; applying .astype('int64') to its output
# converts these Boolean values to integers, resulting in 1s and 0s.
df_new = pd.get_dummies(df).astype('int64')
# Creating dummy variables for categorical columns in the DataFrame and dropping the first category of each column
df_new_1 = pd.get_dummies(df, drop_first=True).astype('int64')
# Selecting specific columns and updating the DataFrame with the selected columns
df = df[['Salaries', 'age', 'Position', 'State', 'Sex',
'MaritalDesc', 'CitizenDesc', 'EmploymentStatus', 'Department', 'Race']]
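# One Hot Encoder setup (assumed; only the fit_transform call appears in the original extract).
# Recent scikit-learn versions use sparse_output=False; older versions use sparse=False.
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse_output=False)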
# Transforming the categorical columns (from the 'Position' column onwards) into one-hot encoded format and converting to a DataFrame
enc_df = pd.DataFrame(enc.fit_transform(df.iloc[:, 2:]),
                      columns=enc.get_feature_names_out(input_features=df.iloc[:, 2:].columns))
#######################
# Label Encoder
# Label Encoding is typically applied to a single column or feature at a time, meaning it operates on one-dimensional data.
# Importing the LabelEncoder class from the sklearn.preprocessing module
from sklearn.preprocessing import LabelEncoder
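# A minimal usage sketch (assumed, not in the original extract): encode one column at a time
le = LabelEncoder()
sex_encoded = le.fit_transform(df['Sex'])  # array of integer codes, one per row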
########################
# Ordinal Encoding
# Importing the OrdinalEncoder class from the sklearn.preprocessing module
from sklearn.preprocessing import OrdinalEncoder
# Ordinal Encoding can handle multiple dimensions or features simultaneously.
oe = OrdinalEncoder()
# Data Split into Input and Output variables
# X contains the features (independent variables), excluding the last column
X = df.iloc[:, :9]
# y contains the target variable (dependent variable), which is the last column
y = df.iloc[:, 9]
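# A minimal usage sketch (assumed): ordinal-encode the categorical feature columns of X
X_cat = X.iloc[:, 2:]   # columns from 'Position' onwards are categorical
X_enc = pd.DataFrame(oe.fit_transform(X_cat), columns=X_cat.columns)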
# Loading the modified ethnic dataset from the CSV file "modified ethnic.csv"
df = pd.read_csv(r'D:/New materials/EDA/InClass_DataPreprocessing_datasets/modified ethnic.csv')  # for doing modifications
# Checking for the count of missing values (NA's) in each column of the DataFrame
df.isna().sum()
# Mean Imputer: Replacing missing values in the 'Salaries' column with the mean value
from sklearn.impute import SimpleImputer
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df["Salaries"] = pd.DataFrame(mean_imputer.fit_transform(df[["Salaries"]]))
df["Salaries"].isna().sum() # Checking for any remaining missing values in 'Salaries'
# Median Imputer: Replacing missing values in the 'age' column with the median value
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df["age"] = pd.DataFrame(median_imputer.fit_transform(df[["age"]]))
df["age"].isna().sum() # Checking for any remaining missing values in 'age'
# Mode Imputer: Replacing missing values in the 'Sex' and 'MaritalDesc' columns with the most frequent value
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df["Sex"] = pd.DataFrame(mode_imputer.fit_transform(df[["Sex"]]))
df["MaritalDesc"] = pd.DataFrame(mode_imputer.fit_transform(df[["MaritalDesc"]]))
df.isnull().sum() # Checking for any remaining missing values
# Constant Value Imputer: Replacing missing values in the 'Sex' column with a constant value 'F'
constant_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='F')
df["Sex"] = pd.DataFrame(constant_imputer.fit_transform(df[["Sex"]]))
# Random Imputer: Replacing missing values in the 'age' column with random samples from the same column
from feature_engine.imputation import RandomSampleImputer
random_imputer = RandomSampleImputer(['age'])
df["age"] = pd.DataFrame(random_imputer.fit_transform(df[["age"]]))
df["age"].isna().sum() # Checking for any remaining missing values in 'age'
#####################
# Normal Quantile-Quantile Plot
import scipy.stats as stats
import pylab
# Checking whether the 'gmat' data is normally distributed using a Q-Q plot
stats.probplot(education.gmat, dist="norm", plot=pylab)
# Checking whether the 'workex' data is normally distributed using a Q-Q plot
stats.probplot(education.workex, dist="norm", plot=pylab)
# Original data
prob = stats.probplot(education.workex, dist=stats.norm, plot=pylab)
# Transforming the 'workex' data using the Box-Cox transformation and saving the lambda value
fitted_data, fitted_lambda = stats.boxcox(education.workex)
# Creating subplots
fig, ax = plt.subplots(1, 2)
# Transformed data
prob = stats.probplot(fitted_data, dist=stats.norm, plot=pylab)
# Yeo-Johnson Transform
'''
We can apply it to our dataset without scaling the data.
It supports zero and negative values; it does not require the values of
each input variable to be strictly positive.
'''
# Original data
# Checking whether the 'workex' data is normally distributed using a Q-Q plot
prob = stats.probplot(education.workex, dist=stats.norm, plot=pylab)
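# Applying the Yeo-Johnson transform: a sketch (assumed), using feature_engine's YeoJohnsonTransformer
from feature_engine import transformation
tf_yj = transformation.YeoJohnsonTransformer(variables=['workex'])
edu_tf = tf_yj.fit_transform(education)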
# Transformed data
# Checking whether the transformed 'workex' data is normally distributed using a Q-Q plot
prob = stats.probplot(edu_tf.workex, dist=stats.norm, plot=pylab)
####################################################
######## Standardization and Normalization #########
# Normalization
''' Alternatively we can use the below function'''
# Importing MinMaxScaler from the sklearn.preprocessing module
from sklearn.preprocessing import MinMaxScaler
### Normalization
# Loading the dataset from the CSV file "ethnic diversity.csv"
ethnic1 = pd.read_csv(r"D:/New materials/EDA/InClass_DataPreprocessing_datasets/ethnic diversity.csv")
# Generating dummy variables for categorical columns in the dataset and dropping the first category of each column
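# A sketch (assumed): create dummy variables, then rescale every column to the [0, 1] range
ethnic1 = pd.get_dummies(ethnic1, drop_first=True)
minmaxscale = MinMaxScaler()
ethnic_norm = pd.DataFrame(minmaxscale.fit_transform(ethnic1), columns=ethnic1.columns)
ethnic_norm.describe()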
'''Robust Scaling
Scale features using statistics that are robust to outliers'''
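# A sketch (assumed): robust scaling with sklearn's RobustScaler (centres on the median, scales by the IQR)
from sklearn.preprocessing import RobustScaler
robust = RobustScaler()
ethnic_robust = pd.DataFrame(robust.fit_transform(ethnic1), columns=ethnic1.columns)
ethnic_robust.describe()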
2 + 2  # Run a selected line with the F9 key; the console works as a calculator
# Read the CSV file into the 'Education' DataFrame using the specified file path
Education = pd.read_csv("C:/Users/education.csv")
# Auto EDA
# ---------
# Sweetviz
# Autoviz
# Dtale
# Pandas Profiling
# Dataprep
# Sweetviz
###########
#pip install sweetviz
# Import the sweetviz library
import sweetviz as sv
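# A minimal usage sketch (assumed): analyse the Education DataFrame and write an HTML report
report = sv.analyze(Education)
report.show_html('sweetviz_report.html')  # file name is illustrative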
# Autoviz
###########
# pip install autoviz
# Import the AutoViz_Class from the autoviz package
from autoviz.AutoViz_Class import AutoViz_Class
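# A minimal usage sketch (assumed): run AutoViz on the in-memory Education DataFrame
AV = AutoViz_Class()
av_report = AV.AutoViz(filename="", dfte=Education)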
# D-Tale
########
# pip install dtale  # In case of any error, install the appropriate werkzeug version (pip install werkzeug==2.0.3)
import dtale
import pandas as pd
# Read the CSV file into a DataFrame
df = pd.read_csv(r"D:/New
materials/EDA/InClass_DataPreprocessing_datasets/education.csv")
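# A minimal usage sketch (assumed): launch the D-Tale web interface for the DataFrame
d = dtale.show(df)
d.open_browser()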
# Pandas Profiling
###################
# pip install ydata-profiling
from ydata_profiling import ProfileReport  # older installs: from pandas_profiling import ProfileReport
p = ProfileReport(df)
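# Save the generated profile report to an HTML file (file name is illustrative)
p.to_file("education_profile_report.html")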
# dataprep
###################
from dataprep.eda import create_report
# Generate an EDA report using dataprep
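# A minimal usage sketch (assumed): build the dataprep report and open it in the browser
report = create_report(df)
report.show_browser()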