Outliers, Hypothesis and Natural Language Processing

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 7

Week-07 Outliers, Hypothesis and Natural Language Processing

[25]: import pandas as pd


import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

[26]: iris = pd.read_csv('iris.csv')  # load the dataset from 'iris.csv' (path is relative to the working directory)


iris  # display the loaded frame

[26]: sepal_length sepal_width petal_length petal_width species


0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
.. … … … … …
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

[150 rows x 5 columns]

[27]: iris.columns

[27]: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',


'species'],
dtype='object')

[28]: import pandas as pd


from sklearn.preprocessing import LabelEncoder

1
from sklearn.model_selection import train_test_split

[29]: target_column = 'species'  # name of the label column


# Split the frame into features (every column except the label) and the label itself.
# NOTE(review): X and y are not used again in the cells shown here — confirm whether a
# train/test split was intended.
X = iris.drop(target_column, axis=1)
y = iris[target_column]

[30]: le = LabelEncoder()
# Encode the species names as integer codes.
y_encoded = le.fit_transform(y)
# Overwrite the label column of `iris` itself with the integer codes, so the
# frame is fully numeric from here on.
iris[target_column] = y_encoded

[31]: sns.heatmap(iris.corr(method='pearson').drop(
[], axis=1).drop([], axis=0),
annot = True);

plt.show()

2
#Treating Outliers
var = iris['sepal_width']
[34]: var

[34]: 0 3.5
1 3.0
2 3.2
3 3.1
4 3.6

145 3.0
146 2.5
147 3.0
148 3.4
149 3.0
Name: sepal_width, Length: 150, dtype: float64

[35]: q1 = np.percentile(var, 25)  # first quartile of sepal_width


q3 = np.percentile(var, 75)  # third quartile
iqr = q3 - q1  # interquartile range
# Fences placed 1.5 * IQR below Q1 and above Q3; anything outside is flagged.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Collect every value that lies outside the fences.
outliers = [x for x in var if x < lower_bound or x > upper_bound]


outliers  # display the flagged values

[35]: [4.4, 4.1, 4.2, 2.0]

[36]: median_data = var.median()  # median of sepal_width; used below as the replacement value for outliers


median_data  # display the median

[36]: 3.0

[37]: for i in range(len(var)):


if var[i] in outliers:
var[i] = median_data

print("Data with Outliers Replaced by Median:\n", var)

3
Data with Outliers Replaced by Median:
0 3.5
1 3.0
2 3.2
3 3.1
4 3.6

145 3.0
146 2.5
147 3.0
148 3.4
149 3.0
Name: sepal_width, Length: 150, dtype: float64

[38]: q1 = np.percentile(var, 25)  # first quartile, recomputed on the current contents of var


q3 = np.percentile(var, 75)  # third quartile, recomputed
iqr = q3 - q1  # interquartile range
# Same 1.5 * IQR fences as before, but derived from the recomputed quartiles.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Re-run the outlier check to see whether any value is still flagged.
outliers = [x for x in var if x < lower_bound or x > upper_bound]


print(outliers)

[32]: import sweetviz as sv

[33]: advert_report = sv.analyze(iris)  # build a sweetviz report for the iris frame
# NOTE(review): the variable and file names say "advert"/"Advertising", but the
# frame being analysed is iris — this looks like a leftover from another
# notebook; confirm before renaming.


# Write the report to 'Advertising.html'
advert_report.show_html('Advertising.html')
Report Advertising.html was generated! NOTEBOOK/COLAB USERS: the web browser
MAY not pop up, regardless, the report IS saved in your notebook/colab files.

4
# An empty list is falsy, so this fires exactly when no value was flagged.
if not outliers:
    print("No outliers.")

[]
No outliers.
Hypothesis
[39]: import numpy as np
from scipy.stats import kstest, norm

# Reference sample drawn from a standard normal distribution. It is not used by
# the test below; it is kept so the names and RNG state this cell leaves behind
# are unchanged.
np.random.seed(0) # Setting a seed for reproducibility
sample_data = np.random.normal(loc=0, scale=1, size=1000)

# Perform a KS test to check whether `var` (the sepal_width column) follows a
# normal distribution.
#
# kstest(x, 'norm') with no `args` compares x against the *standard* normal
# N(0, 1). Data centred anywhere else, or with a different spread, is rejected
# purely because of its location/scale — which is where the previously recorded
# p-value of about 5.9e-279 for this cell came from. Compare against a normal
# distribution with the sample's own mean and standard deviation instead.
#
# NOTE: estimating the parameters from the same data makes the KS p-value
# conservative; use a Lilliefors-corrected or Shapiro-Wilk test when a strict
# normality test is required.
ks_statistic, p_value = kstest(var, 'norm', args=(var.mean(), var.std()))

# Define the significance level (alpha)
alpha = 0.05

# Check the result of the KS test. A p-value >= alpha only means normality is
# not rejected at this level, not that it is proven.
if p_value < alpha:
    print(f"The data does NOT follow a normal distribution (p-value = {p_value})")
else:
    print(f"The data follows a normal distribution (p-value = {p_value})")

The data does NOT follow a normal distribution (p-value =


5.8803781394734095e-279)

[40]: # Generate a sample of data that you want to test


np.random.seed(0) # Setting a seed for reproducibility
sample_data_1 = np.random.normal(0,1,100) # 100 draws from a standard normal␣
↪distribution

# Perform a KS test to check if sample_data_1 follows the standard normal distribution


ks_statistic, p_value = kstest(sample_data_1, 'norm')

# Define the significance level (alpha)


alpha = 0.05

# Check the result of the KS test (a p-value >= alpha only means normality is not rejected)


if p_value < alpha:
print(f"The sample does NOT follow a normal distribution (p-value =␣
↪{p_value})")

else:

5
print(f"The sample follows a normal distribution (p-value = {p_value})")

The sample follows a normal distribution (p-value = 0.8667717341286251)

[41]: # Generate a sample of data that you want to test


np.random.seed(0) # Setting a seed for reproducibility
sample_data_2 = np.random.uniform(0,1,100) # 100 draws from a uniform␣
↪distribution

# Perform a KS test to check if sample_data_2 follows the standard normal distribution


ks_statistic, p_value = kstest(sample_data_2, 'norm')

# Define the significance level (alpha)


alpha = 0.05

# Check the result of the KS test (a p-value below alpha rejects normality)


if p_value < alpha:
print(f"The sample does NOT follow a normal distribution (p-value =␣
↪{p_value})")

else:
print(f"The sample follows a normal distribution (p-value = {p_value})")

The sample does NOT follow a normal distribution (p-value =


7.902176095057778e-24)
Natural Language Processing
[ ]: # This section is about converting text into a vector
import pandas as pd
import numpy as np
import collections
import re

[ ]: #Sample documents
doc1 = 'Game of Thrones is an amazing tv series!, Game of Thrones is the best␣
↪tv series! and Game of Thrones is so great'

# Lower-case the text, remove every character that is neither a word character
# nor whitespace (i.e. the punctuation), then split the result into words


w_doc1= re.sub(r'[^\w\s]','', doc1.lower()).split()
# Print the resulting list of words
print(w_doc1)

['game', 'of', 'thrones', 'is', 'an', 'amazing', 'tv', 'series', 'game', 'of',
'thrones', 'is', 'the', 'best', 'tv', 'series', 'and', 'game', 'of', 'thrones',
'is', 'so', 'great']

[ ]: import nltk
from nltk.corpus import stopwords

6
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data…


[nltk_data] Unzipping corpora/stopwords.zip.
True

[ ]: stop_words = set(stopwords.words('english'))  # English stop words, held in a set for membership tests
# Keep only the words that are not stop words. w_doc1 was already lower-cased
# when it was built, so the .lower() call here changes nothing for this input.
filtered_words = [word for word in w_doc1 if word.lower() not in stop_words]

# Reconstruct the text without stop words (words joined by single spaces)


filtered_text = ' '.join(filtered_words)

# Print the text without stop words


print(filtered_text)

game thrones amazing tv series game thrones best tv series game thrones great

[ ]: from sklearn.feature_extraction.text import CountVectorizer


# Corpus for the vectorizer: a list containing a single document. Note that this
# rebinds doc1 to a list; it was a plain string in the tokenisation cell.
doc1 = ['Game of Thrones is an amazing tv series!, Game of Thrones is the best␣
↪tv series! and Game of Thrones is so great']

# Create an instance of CountVectorizer


vectorizer = CountVectorizer()
# Fit the vectorizer on the sentences and transform them into a Bag of Words␣
↪representation

# Note: this rebinds X, which held the iris feature columns earlier in the notebook.
X = vectorizer.fit_transform(doc1)
# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Convert the Bag of Words representation to a dense matrix and print it
# (one row per document, one count per feature name)
print(X.toarray())
print("Feature names (words):", feature_names)

[[1 1 1 1 3 1 3 3 2 1 1 3 2]]
Feature names (words): ['amazing' 'an' 'and' 'best' 'game' 'great' 'is' 'of'
'series' 'so' 'the'
'thrones' 'tv']

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy