Outliers, Hypothesis and Natural Language Processing

Week-07 Outliers, Hypothesis and Natural Language Processing
[25]: import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
[26]: iris = pd.read_csv('iris.csv')

iris
[26]: sepal_length sepal_width petal_length petal_width species

0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
.. … … … … …
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica
[150 rows x 5 columns]
[27]: iris.columns
[27]: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',

'species'],
dtype='object')
[28]: import pandas as pd

from sklearn.preprocessing import LabelEncoder
1
from sklearn.model_selection import train_test_split
# In [29]:
# Separate the label column from the measurement columns.
target_column = 'species'
X = iris.drop(columns=[target_column])
y = iris[target_column]

# In [30]:
# Turn the species names into integer codes and write the codes back onto the
# frame; the next cell computes correlations over every column of `iris`.
le = LabelEncoder()
y_encoded = le.fit(y).transform(y)
iris[target_column] = y_encoded
# In [31]:
# Pearson correlation between every pair of columns of `iris`, drawn as an
# annotated heatmap.  The former `.drop([], axis=...)` calls removed nothing
# and are gone; the figure now carries a title so it can be read on its own.
corr_matrix = iris.corr(method='pearson')
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title('Pearson correlation of the Iris columns')
plt.show()
2
# In [34]:
# Treating outliers
# Work on an explicit copy of the column.  Without the copy, `var` aliases a
# column of `iris`, and whether later writes to `var` also change `iris`
# depends on the pandas version (Copy-on-Write).
# NOTE(review): with the copy, `iris` keeps its raw sepal_width values; assign
# `var` back to the frame if the treated column is wanted there.
var = iris['sepal_width'].copy()
var
[34]: 0 3.5
1 3.0
2 3.2
3 3.1
4 3.6
…
145 3.0
146 2.5
147 3.0
148 3.4
149 3.0
Name: sepal_width, Length: 150, dtype: float64
# In [35]:
# Quartiles and the 1.5 * IQR fences around them.
q1, q3 = np.percentile(var, [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Values strictly outside the fences, in their original row order.
is_outlier = (var < lower_bound) | (var > upper_bound)
outliers = var[is_outlier].tolist()

outliers
[35]: [4.4, 4.1, 4.2, 2.0]
[36]: median_data = var.median()

median_data
[36]: 3.0
# In [37]:
# Replace every value outside the IQR fences with the median in one vectorised
# step.  `mask` returns a new Series, so nothing is assigned element by element
# through a column taken from `iris`, and no per-row list lookup is needed.
outside_fences = (var < lower_bound) | (var > upper_bound)
var = var.mask(outside_fences, median_data)
print("Data with Outliers Replaced by Median:\n", var)
3
Data with Outliers Replaced by Median:
0 3.5
1 3.0
2 3.2
3 3.1
4 3.6
…
145 3.0
146 2.5
147 3.0
148 3.4
149 3.0
Name: sepal_width, Length: 150, dtype: float64
# In [38]:
# Repeat the IQR check on `var` as it stands after the replacement step.
quartiles = np.percentile(var, [25, 75])
q1, q3 = quartiles[0], quartiles[1]
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Collect whatever still falls strictly outside the fences.
outliers = list(var[(var < lower_bound) | (var > upper_bound)])

print(outliers)
[32]: import sweetviz as sv
[33]: advert_report = sv.analyze(iris)

#display the report
advert_report.show_html('Advertising.html')
Report Advertising.html was generated! NOTEBOOK/COLAB USERS: the web browser
MAY not pop up, regardless, the report IS saved in your notebook/colab files.
4
# An empty list is falsy, so this branch runs exactly when nothing was flagged.
if not outliers:
    print("No outliers.")
[]
No outliers.
Hypothesis
# In [39]:
import numpy as np
from scipy.stats import kstest, norm

# Seed the global NumPy RNG so the random draws in this section are repeatable.
np.random.seed(0)
# This draw is not tested in this cell; it is kept because it advances the
# seeded global RNG stream that the following cells sample from.
sample_data = np.random.normal(loc=0, scale=1, size=1000)

# Kolmogorov-Smirnov test of the sepal-width values in `var` against a normal
# distribution.  `kstest(var, 'norm')` on its own compares with the standard
# normal N(0, 1), so data with a different mean/spread is rejected because of
# its location and scale rather than its shape.  The reference distribution
# is therefore given the sample's own mean and standard deviation.
# NOTE(review): estimating the parameters from the tested sample makes the KS
# p-value conservative; a Lilliefors-type test would be the stricter choice.
# NOTE(review): any output recorded before this change came from the N(0, 1)
# comparison and needs to be regenerated.
ks_statistic, p_value = kstest(var, 'norm', args=(var.mean(), var.std()))

# Significance level
alpha = 0.05

# Reject normality when the p-value falls below alpha.
if p_value < alpha:
    print(f"The data does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The data follows a normal distribution (p-value = {p_value})")
The data does NOT follow a normal distribution (p-value =

5.8803781394734095e-279)
# In [40]:
# Draw 100 values from a normal distribution with mean 0 and std 1.
sample_data_1 = np.random.normal(loc=0, scale=1, size=100)

# One-sample KS test of the draw against the standard normal.
ks_result = kstest(sample_data_1, 'norm')
ks_statistic, p_value = ks_result.statistic, ks_result.pvalue

alpha = 0.05

# Normality is only rejected when the p-value is below alpha.
if not (p_value < alpha):
    print(f"The sample follows a normal distribution (p-value = {p_value})")
else:
    print(f"The sample does NOT follow a normal distribution (p-value = {p_value})")
The sample follows a normal distribution (p-value = 0.8667717341286251)
# In [41]:
# Draw 100 values from a uniform distribution on [0, 1).
sample_data_2 = np.random.uniform(low=0, high=1, size=100)

# One-sample KS test of the uniform draw against the standard normal.
ks_statistic, p_value = kstest(sample_data_2, 'norm')

alpha = 0.05

rejects_normality = p_value < alpha
if rejects_normality:
    print(f"The sample does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The sample follows a normal distribution (p-value = {p_value})")
The sample does NOT follow a normal distribution (p-value =

7.902176095057778e-24)
Natural Language Processing
[ ]: # This section covers converting text into a vector representation
import pandas as pd
import numpy as np
import collections
import re
# In [ ]:
# Sample document
doc1 = (
    'Game of Thrones is an amazing tv series!, Game of Thrones is the best '
    'tv series! and Game of Thrones is so great'
)

# Lower-case the text, delete every character that is neither a word
# character nor whitespace, then split on whitespace to obtain the tokens.
lowered = doc1.lower()
without_punctuation = re.sub(r'[^\w\s]', '', lowered)
w_doc1 = without_punctuation.split()

# Show the resulting tokens
print(w_doc1)
['game', 'of', 'thrones', 'is', 'an', 'amazing', 'tv', 'series', 'game', 'of',
'thrones', 'is', 'the', 'best', 'tv', 'series', 'and', 'game', 'of', 'thrones',
'is', 'so', 'great']
[ ]: import nltk
from nltk.corpus import stopwords
6
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data…

[nltk_data] Unzipping corpora/stopwords.zip.
True
# In [ ]:
# English stop words from NLTK, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

# Keep the tokens whose lower-cased form is not a stop word.
filtered_words = list(
    filter(lambda word: word.lower() not in stop_words, w_doc1)
)

# Join the surviving tokens back into one space-separated string.
filtered_text = ' '.join(filtered_words)

# Show the text without stop words
print(filtered_text)
game thrones amazing tv series game thrones best tv series game thrones great
# In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# A corpus made of a single document.
doc1 = [
    'Game of Thrones is an amazing tv series!, Game of Thrones is the best '
    'tv series! and Game of Thrones is so great'
]

# Fit the vectorizer on the corpus and get its Bag of Words counts in one call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(doc1)

# The words that the vectorizer reports as its features.
feature_names = vectorizer.get_feature_names_out()

# Print the counts as a dense array, followed by the feature names.
print(X.toarray())
print("Feature names (words):", feature_names)
[[1 1 1 1 3 1 3 3 2 1 1 3 2]]
Feature names (words): ['amazing' 'an' 'and' 'best' 'game' 'great' 'is' 'of'
'series' 'so' 'the'
'thrones' 'tv']

Outliers, Hypothesis and Natural Language Processing

Uploaded by

Copyright:

Available Formats

Outliers, Hypothesis and Natural Language Processing

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Outliers, Hypothesis and Natural Language Processing

Uploaded by

Copyright:

Available Formats

Week-07 Outliers, Hypothesis and Natural Language Processing

[25]: import pandas as pd

[26]: iris = pd.read_csv('iris.csv')

[26]: sepal_length sepal_width petal_length petal_width species

[150 rows x 5 columns]

[27]: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',

[28]: import pandas as pd

[29]: target_column = 'species'

[35]: q1 = np.percentile(var, 25)

outliers = [x for x in var if x < lower_bound or x > upper_bound]

[35]: [4.4, 4.1, 4.2, 2.0]

[36]: median_data = var.median()

[37]: for i in range(len(var)):

print("Data with Outliers Replaced by Median:\n", var)

[38]: q1 = np.percentile(var, 25)

outliers = [x for x in var if x < lower_bound or x > upper_bound]

[32]: import sweetviz as sv

[33]: advert_report = sv.analyze(iris)

# Generate a sample of data that you want to test

# Perform a KS test to check if the sample_data follows a normal distribution

# Define the significance level (alpha)

# Check the result of the KS test

The data does NOT follow a normal distribution (p-value =

[40]: # Generate a sample of data that you want to test

# Perform a KS test to check if the sample_data follows a normal distribution

# Define the significance level (alpha)

# Check the result of the KS test

The sample follows a normal distribution (p-value = 0.8667717341286251)

[41]: # Generate a sample of data that you want to test

# Perform a KS test to check if the sample_data follows a normal distribution

# Define the significance level (alpha)

# Check the result of the KS test

The sample does NOT follow a normal distribution (p-value =

#Sentance without punctuations and split them

[nltk_data] Downloading package stopwords to /root/nltk_data…

# Reconstruct the text without stop words

# Print the text without stop words

[ ]: from sklearn.feature_extraction.text import CountVectorizer

# Create an instance of CountVectorizer

You might also like

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.