DATA SCIENCE EXPERIMENTS

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 31

DATA SCIENCE EXPERIMENTS

Experiment = 2
AIM: To plot the probability distribution curve using Python
CODE:
Normal Distribution
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

# Draw 1000 random samples from a normal distribution (mean=0, std=1).
data = np.random.normal(loc=0, scale=1, size=1000)

# Plot the empirical distribution with Seaborn: a histogram normalised to a
# density, with a kernel density estimate (kde=True) drawn on top.
sns.histplot(data, kde=True, stat="density")
plt.title('Probability Distribution Curve')
plt.xlabel('Value')
plt.ylabel('Density')
plt.show()

# Generate the x values to evaluate the curve on (mean ± 4 std).
x = np.linspace(-4, 4, 1000)

# Calculate the probability density function (PDF) of the same normal
# distribution at every x.
pdf = norm.pdf(x, loc=0, scale=1)

# Plot the theoretical PDF using Matplotlib.
plt.plot(x, pdf, label='PDF', color='blue')

# Add title and axis labels, then show the plot.
plt.title('Normal Distribution PDF')
plt.xlabel('Value')
plt.ylabel('Probability Density')
plt.show()

OUTPUT:
Binomial Distribution
Code:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import binom
# Parameters for the binomial distribution.
n = 10   # Number of trials
p = 0.5  # Probability of success

# Generate 1000 random samples from the binomial distribution.
data = np.random.binomial(n, p, 1000)

# Plot the sampled distribution using Seaborn.
# bins=range(n + 2) supplies integer bin edges 0..n+1;
# stat="probability" makes the bar heights sum to 1.
sns.histplot(data, bins=range(n + 2), kde=False, stat="probability")
plt.title('Binomial Distribution')
plt.xlabel('Number of successes')
plt.ylabel('Probability')
plt.show()

# Create the range of possible outcomes (0..n successes).
x = np.arange(0, n + 1)

# Compute the probability mass function (PMF) for each outcome.
pmf = binom.pmf(x, n, p)

# Plot the PMF using Matplotlib: a vertical line per outcome with a dot on top.
plt.vlines(x, 0, pmf, colors='b', lw=5)
plt.plot(x, pmf, 'bo', ms=8)
plt.title('Binomial Distribution PMF')
plt.xlabel('Number of successes')
plt.ylabel('Probability')
plt.show()
OUTPUT:
Experiment= 3
AIM: To perform the Chi-square test on various data sets
1. Chi-Square Goodness-of-Fit Test
Code:
import scipy.stats as stats
import numpy as np

# Observed values: number of years of experience of each employee.
experience_in_years = [8, 6, 10, 7, 8, 11, 9]
# Expected values: yearly salary package in lakhs.
salary = [9, 8, 11, 8, 10, 7, 6]

# Chi-square goodness-of-fit statistic computed directly from the formula
#     sum((observed - expected)^2 / expected)
chi_square_test_statistic1 = 0
for observed, expected in zip(experience_in_years, salary):
    chi_square_test_statistic1 += np.square(observed - expected) / expected

print('chi square value determined by formula : ' +
      str(chi_square_test_statistic1))

# Chi-square critical value at the 5% significance level.
# Degrees of freedom = number of categories - 1 (7 - 1 = 6 here).
print(stats.chi2.ppf(1 - 0.05, df=len(salary) - 1))

OUTPUT:
Plot Chi-Square Test:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

# Grid of x values on which each chi-square PDF is evaluated.
x = np.linspace(0, 10, 100)

fig, ax = plt.subplots(1, 1)

# One line style per degrees-of-freedom value, paired up by position.
linestyles = [':', '--', '-.', '-']
deg_of_freedom = [1, 4, 7, 6]

for df, ls in zip(deg_of_freedom, linestyles):
    # Each curve carries a label so that plt.legend() below has entries to
    # show; without labels the legend would be empty.
    ax.plot(x, stats.chi2.pdf(x, df), linestyle=ls, label=f'df = {df}')

plt.xlim(0, 10)
plt.ylim(0, 0.4)

plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Chi-Square Distribution')

plt.legend()
plt.show()
Output:
Experiment=4
AIM: To use Python as a programming tool for the analysis of data structures
Code:
import statistics

# --- Basic list operations -------------------------------------------------
my_list = [1, 2, 3, 4, 5]
my_list.append(6)   # add 6 at the end
my_list.remove(3)   # drop the first occurrence of the value 3
my_list[0] = 10     # overwrite the first element

# --- Basic statistical measures --------------------------------------------
data = [23, 45, 67, 23, 89, 45, 23]
mean, median, mode, variance = (
    statistics.mean(data),
    statistics.median(data),
    statistics.mode(data),
    statistics.variance(data),  # sample variance
)
print(f"Mean: {mean}, Median: {median}, Mode: {mode}, Variance: {variance}")
OUTPUT:
# Dictionary mapping each word to the number of times it occurs.
word_count = {}

# Sample text.
text = "Python is great. Python is versatile. Python is simple."

# Count the occurrences of each whitespace-separated word.
# Punctuation stays attached to the word, so "great." is its own key.
words = text.split()
for word in words:
    # .get(word, 0) yields 0 the first time a word is seen.
    word_count[word] = word_count.get(word, 0) + 1

print(word_count)
Example: Set Operations for Analysis
set_a = {1, 2, 3, 4}
set_b = {3, 4, 5, 6}

# Union: every element that appears in at least one of the sets.
print(set_a.union(set_b))         # {1, 2, 3, 4, 5, 6}

# Intersection: only the elements shared by both sets.
print(set_a.intersection(set_b))  # {3, 4}

# Difference: elements of set_a that are absent from set_b.
print(set_a.difference(set_b))    # {1, 2}

from collections import deque

# A plain list works as a stack (Last In, First Out):
# push with append(), pop from the same end with pop().
stack = []
for item in (1, 2, 3):
    stack.append(item)
print(stack.pop())       # Output: 3 (the most recently pushed item)

# A deque works as a queue (First In, First Out):
# enqueue on the right with append(), dequeue on the left with popleft().
queue = deque([1, 2, 3])
queue.append(4)
print(queue.popleft())   # Output: 1 (the oldest item)
Output
Experiment=5
AIM: To perform various operations such as data storage, analysis and
visualization using Python.
Code:
1. Data Storage
Example 1: CSV Files (Comma-Separated Values)
import csv

# Rows to store: a header row followed by one row per person.
data = [
    ['Name', 'Age', 'City'],
    ['Alice', 30, 'New York'],
    ['Bob', 25, 'Los Angeles'],
]

# Writing data to a CSV file.
# newline='' lets the csv module control line endings itself, as the csv
# documentation recommends for files passed to csv.writer / csv.reader.
with open('data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(data)

# Reading data back from the CSV file.
# Every field comes back as a string (e.g. '30', not 30).
with open('data.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        print(row)
Example 2: Storing Data in SQLite Database
import sqlite3
# Connect to a SQLite database file (created if it does not exist yet).
conn = sqlite3.connect('example.db')
try:
    # Create a cursor to execute SQL statements.
    cur = conn.cursor()

    # Create the table on first run; later runs reuse the existing table.
    cur.execute('''CREATE TABLE IF NOT EXISTS students (name TEXT, age INTEGER)''')

    # Insert the rows with a parameterised statement. The same two rows are
    # inserted on every run, so re-running the script appends duplicates.
    cur.executemany(
        "INSERT INTO students (name, age) VALUES (?, ?)",
        [('Alice', 30), ('Bob', 25)],
    )

    # Commit the changes so they are persisted to the file.
    conn.commit()
finally:
    # Close the connection even when one of the statements above raises.
    conn.close()
Output
2. Data Analysis
Example 1: Data Analysis Using Pandas
import pandas as pd
# Read the CSV file into a DataFrame and preview its first rows.
df = pd.read_csv('data.csv')
print(df.head())

# Overall mean of the 'Age' column.
age_column = df['Age']
mean_age = age_column.mean()
print(f'Mean Age: {mean_age}')

# Mean of 'Age' within each distinct 'City'.
ages_by_city = df.groupby('City')['Age']
grouped_data = ages_by_city.mean()
print(grouped_data)

import numpy as np
# Sample values held in a NumPy array.
data = np.array([10, 20, 30, 40, 50])

# Arithmetic mean and (population) standard deviation via the array methods.
mean = data.mean()
std_dev = data.std()
print(f"Mean: {mean}, Standard Deviation: {std_dev}")
Output:

3. Data Visualization
Example 1: Simple Plot Using Matplotlib
import matplotlib.pyplot as plt

# Points to draw: x runs 1..5 and each y is ten times its x.
x = list(range(1, 6))
y = [10 * value for value in x]

# Draw the line; the label is what the legend displays.
plt.plot(x, y, label='Data')

# Title, axis labels and legend.
plt.title('Simple Line Plot')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()

# Render the figure.
plt.show()

Example 2: Bar Plot and Histogram Using Seaborn


import seaborn as sns
import pandas as pd

# Four categories, each with one numeric value.
data = dict(Category=['A', 'B', 'C', 'D'], Values=[10, 25, 17, 30])
df = pd.DataFrame(data)

# Bar plot: one bar per category, bar height taken from 'Values'.
sns.barplot(data=df, x='Category', y='Values')
plt.title('Bar Plot')
plt.show()

# Histogram of the 'Values' column split into 5 bins.
values = df['Values']
sns.histplot(values, bins=5)
plt.title('Histogram of Values')
plt.show()

Output
Experiment 6
AIM: To perform descriptive statistics analysis and Data Visualization
Code:
import numpy as np
from scipy import stats
# Create a dataset (you can use lists or NumPy arrays).
data = np.array([23, 45, 67, 23, 89, 45, 23, 67, 90, 56])

# Calculate mean, median, and mode.
mean = np.mean(data)
median = np.median(data)
mode = stats.mode(data)

# stats.mode returns scalars for 1-D input on SciPy >= 1.11 (keepdims=False
# by default) but length-1 arrays on older releases. np.atleast_1d accepts
# both, so indexing with [0] no longer fails on newer SciPy.
mode_value = np.atleast_1d(mode.mode)[0]
mode_count = np.atleast_1d(mode.count)[0]

# Calculate variance and standard deviation.
# NumPy defaults to the population formulas (ddof=0).
variance = np.var(data)
std_dev = np.std(data)

# Print the results.
print(f'Mean: {mean}')
print(f'Median: {median}')
print(f'Mode: {mode_value}, Count: {mode_count}')
print(f'Variance: {variance}')
print(f'Standard Deviation: {std_dev}')

Output:

Data Visualization
import pandas as pd
import matplotlib.pyplot as plt
# Read the CSV data set. The file must provide the 'total_bill' and 'tip'
# columns used below.
dataset = pd.read_csv("tips.csv")

# Plot a scatter plot of total_bill vs tip.
plt.scatter(dataset['total_bill'], dataset['tip'])

# Give the plot a title.
plt.title("This is Scatter Plot")

# Give the x and y axes their label names.
plt.xlabel('Total_bill')
plt.ylabel('Tip')
plt.show()

# importing the required modules


import pandas as pd
import plotly.express
# Read the CSV dataset through pandas.
dataset = pd.read_csv("tips.csv")

# Build the scatter plot of total_bill vs tip, colouring each point by the
# value of the 'smoker' column.
graph = plotly.express.scatter(dataset, x="total_bill", y="tip", color='smoker')

# Display the plot created.
graph.show()
Experiment=7
AIM: To perform principal component analysis on datasets.
Code:
# importing required libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# scikit-learn pieces used by this pipeline.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Load the dataset.
dataset = pd.read_csv('wine.csv')

# Columns 0-12 become the feature matrix X, column 13 the target y.
X = dataset.iloc[:, :13].values
y = dataset.iloc[:, 13].values

# Hold out 20% of the rows for testing; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Standardise the features: fit the scaler on the training rows only, then
# apply that same fitted transform to the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Project onto the first two principal components, again fitting on the
# training rows only.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Fraction of the variance explained by each retained component.
explained_variance = pca.explained_variance_ratio_

# Fit logistic regression on the two-component training data.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out rows and compare them with the true labels.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Visualise the classifier's predictions over the training set as a
# scatter plot on top of the predicted regions.
from matplotlib.colors import ListedColormap

X_set, y_set = X_train, y_train

# Dense grid covering the range of both principal components, with a margin
# of 1 on every side and a step of 0.01.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1, step=0.01),
)

# Predict a class for every grid point and fill the plane with one colour
# per predicted class.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
plt.contourf(X1, X2, classifier.predict(grid_points).reshape(X1.shape),
             alpha=0.75,
             cmap=ListedColormap(('yellow', 'white', 'aquamarine')))

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# Overlay the training points, one colour and legend entry per class label.
point_colors = ListedColormap(('red', 'green', 'blue'))
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=point_colors(i), label=j)

plt.title('Logistic Regression (Training set)')
plt.xlabel('PC1')  # for Xlabel
plt.ylabel('PC2')  # for Ylabel
plt.legend()  # to show legend

# Show the scatter plot.
plt.show()

OUTPUT:
Experiment=8
AIM: To perform linear regression on datasets.
Code:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.core.common import random_state
from sklearn.linear_model import LinearRegression
# Get dataset.
df_sal = pd.read_csv('/content/Salary_Data.csv')
# Outside a notebook a bare expression displays nothing, so print the preview.
print(df_sal.head())

# Describe data (summary statistics of the numeric columns).
print(df_sal.describe())

# Data distribution.
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the equivalent histogram plus density curve.
plt.title('Salary Distribution Plot')
sns.histplot(df_sal['Salary'], kde=True, stat='density')
plt.show()

# Relationship between Salary and Experience.
plt.scatter(df_sal['YearsExperience'], df_sal['Salary'], color='lightcoral')
plt.title('Salary vs Experience')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.box(False)
plt.show()

# Splitting variables: the first column is the independent variable and the
# remaining column(s) the dependent variable.
X = df_sal.iloc[:, :1]  # independent
y = df_sal.iloc[:, 1:]  # dependent

# Splitting dataset into test/train (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Regressor model.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Prediction result.
y_pred_test = regressor.predict(X_test)    # predicted value of y_test
y_pred_train = regressor.predict(X_train)  # predicted value of y_train

# Prediction on training set.
# Each artist carries its own label, so the legend text cannot be attached to
# the wrong artist; the line shows predictions for the training rows.
plt.scatter(X_train, y_train, color='lightcoral', label='X_train/y_train')
plt.plot(X_train, y_pred_train, color='firebrick',
         label='X_train/Pred(y_train)')
plt.title('Salary vs Experience (Training Set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.legend(title='Sal/Exp', loc='best', facecolor='white')
plt.box(False)
plt.show()

OUTPUT:
Experiment= 9
AIM: To perform Data Aggregation and group wise operations.
Code:
import pandas as pd
# Sample sales figures: one row per (country, year) pair.
data = {
    'Country': ['India', 'India', 'USA', 'USA', 'Germany', 'Germany'],
    'Year': [2020, 2021, 2020, 2021, 2020, 2021],
    'Sales': [250, 300, 200, 210, 150, 180],
}

# Build the DataFrame and show it.
df = pd.DataFrame(data)
print(df)

# Total 'Sales' per country.
by_country = df.groupby('Country')
grouped_df = by_country.agg({'Sales': 'sum'})
print(grouped_df)

# Mean 'Sales' for every (country, year) combination.
by_country_and_year = df.groupby(['Country', 'Year'])
grouped_df = by_country_and_year.agg({'Sales': 'mean'})
print(grouped_df)

# Several aggregations of 'Sales' per country in a single call.
sales_summary = {'Sales': ['sum', 'mean', 'max']}
grouped_df = df.groupby('Country').agg(sales_summary)
print(grouped_df)

OUTPUT:

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy