DATA SCIENCE EXPERIMENTS
DATA SCIENCE EXPERIMENTS
DATA SCIENCE EXPERIMENTS
Experiment 2
AIM: To plot the probability distribution curve using Python
CODE:
Normal Distribution
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
OUTPUT:
Binomial Distribution
Code:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import binom
# Binomial distribution demo: draw random samples, plot their empirical
# distribution, then plot the exact probability mass function (PMF).

# Parameters for the binomial distribution
n = 10  # Number of trials
p = 0.5  # Probability of success

# Generate random data from a binomial distribution (1000 samples)
data = np.random.binomial(n, p, 1000)

# Plot the sampled distribution using Seaborn.
# bins=range(n + 2) gives the edges 0, 1, ..., n + 1, i.e. one unit-wide bin
# per possible outcome 0..n; stat="probability" normalises the bar heights
# so that they sum to 1 instead of showing raw counts.
sns.histplot(data, bins=range(n + 2), kde=False, stat="probability")
plt.title('Binomial Distribution')
plt.xlabel('Number of successes')
plt.ylabel('Probability')
plt.show()

# Create a range of x values (possible outcomes 0..n)
x = np.arange(0, n + 1)

# Compute the exact PMF for the binomial distribution
pmf = binom.pmf(x, n, p)

# Plot the PMF using Matplotlib
plt.vlines(x, 0, pmf, colors='b', lw=5)  # Vertical lines for PMF
plt.plot(x, pmf, 'bo', ms=8)  # Dots to show probabilities
plt.title('Binomial Distribution PMF')
plt.xlabel('Number of successes')
plt.ylabel('Probability')
plt.show()
OUTPUT:
Experiment 3
AIM: To perform the Chi-square test on various data sets
1. Chi-Square Goodness-of-Fit Test
Code:
import scipy.stats as stats
import numpy as np
OUTPUT:
Plot Chi-Square Test:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
# Decorate the current figure: title and axis captions first ...
plt.title('Chi-Square Distribution')
plt.xlabel('Value')
plt.ylabel('Frequency')

# ... then restrict the visible window to x in [0, 10] and y in [0, 0.4].
plt.ylim(0, 0.4)
plt.xlim(0, 10)

# Add the legend and display the figure.
# NOTE(review): the plotting calls that create the curves are not part of
# this excerpt, so the legend entries depend on code not shown here.
plt.legend()
plt.show()
Output:
Experiment 4
AIM: To use Python as a programming tool for the analysis of data structures
Code:
# Start from a list holding the integers 1 through 5
my_list = list(range(1, 6))

# Basic in-place list operations (each touches a different element,
# so the final list is [10, 2, 4, 5, 6])
my_list[0] = 10      # overwrite the element at index 0
my_list.remove(3)    # drop the first occurrence of the value 3
my_list.append(6)    # push a new element onto the end
import statistics
# Sample observations to summarise
data = [23, 45, 67, 23, 89, 45, 23]

# Compute the basic descriptive measures in one go using the standard
# library's statistics module (variance here is the sample variance).
mean, median, mode, variance = (
    statistics.mean(data),
    statistics.median(data),
    statistics.mode(data),
    statistics.variance(data),
)

print(f"Mean: {mean}, Median: {median}, Mode: {mode}, Variance: {variance}")
OUTPUT:
# Count how often each word occurs in a piece of text.

# Dictionary mapping each word to the number of times it has been seen
word_count = {}

# Sample text
text = "Python is great. Python is versatile. Python is simple."

# Split on whitespace; punctuation stays attached to the word it follows,
# so "great." is counted as its own token.
words = text.split()

# Count the occurrences of each word. dict.get supplies 0 the first time a
# word is seen, so no separate membership check is needed.
for word in words:
    word_count[word] = word_count.get(word, 0) + 1

print(word_count)
Example: Set Operations for Analysis
# Two overlapping sets to compare
set_a = {1, 2, 3, 4}
set_b = {3, 4, 5, 6}

# Everything that appears in at least one of the sets -> {1, 2, 3, 4, 5, 6}
print(set_a.union(set_b))

# Only the elements the two sets share -> {3, 4}
print(set_a.intersection(set_b))

# Elements of set_a that are absent from set_b -> {1, 2}
print(set_a.difference(set_b))
import numpy as np
# Build a NumPy integer array holding 10, 20, 30, 40, 50
# (start at 10, step by 10, stop before 51)
data = np.arange(10, 51, 10)
3. Data Visualization
Example 1: Simple Plot Using Matplotlib
import matplotlib.pyplot as plt
# Points to plot: x runs from 1 to 5 and y from 10 to 50 in steps of 10
x = list(range(1, 6))
y = list(range(10, 51, 10))
# Histogram of a small example data set.
# pandas is imported here because this snippet is the first one in the file
# to use `pd`; without the import the DataFrame construction raises NameError.
import pandas as pd

# Example data: one categorical column and one numeric column
data = {'Category': ['A', 'B', 'C', 'D'], 'Values': [10, 25, 17, 30]}
df = pd.DataFrame(data)

# Generate a histogram of the numeric column, grouped into 5 bins
sns.histplot(df['Values'], bins=5)
plt.title('Histogram of Values')
plt.show()
Output
Experiment 6
AIM: To perform descriptive statistics analysis and Data Visualization
Code:
import numpy as np
from scipy import stats
# Sample observations for the descriptive-statistics analysis
# (a plain Python list would work too; here they are stored as a NumPy array)
data = np.array([
    23, 45, 67, 23, 89,
    45, 23, 67, 90, 56,
])
Output:
Data Visualization
import pandas as pd
import matplotlib.pyplot as plt
# Read the tips data set from a CSV file in the current working directory
dataset = pd.read_csv("tips.csv")

# Plot a scatter plot of total_bill (x axis) vs tip (y axis)
plt.scatter(dataset['total_bill'], dataset['tip'])

# Display the figure, as the other plotting snippets in this file do
plt.show()
from sklearn.decomposition import PCA

# Scale the features: fit the scaler on the training set only, then apply
# the same fitted transformation to the test set.
# NOTE(review): `sc`, `X_train` and `X_test` are created earlier, outside
# this excerpt; `sc` is presumably a scikit-learn scaler (confirm).
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Apply PCA to the X component: fit on the training set, keeping two
# components, and project both sets onto them.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Fraction of the total variance captured by each retained component
explained_variance = pca.explained_variance_ratio_
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Train a logistic-regression classifier on the training set
# (random_state is pinned to 0)
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predict the labels of the test set with the fitted classifier
y_pred = classifier.predict(X_test)

# Confusion matrix between the test-set labels and the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Visualise the training-set result as a scatter plot, one colour per class.
# NOTE(review): `X1`, `X2`, `X_set` and `y_set` are defined earlier, outside
# this excerpt; `X_set` is indexed as a 2-column array and `y_set` as the
# matching label vector.
from matplotlib.colors import ListedColormap

# Limit the axes to the extent of the X1 / X2 arrays
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# The colour map is the same on every iteration, so build it once up front
class_colors = ListedColormap(('red', 'green', 'blue'))

# Draw the points of each distinct label separately so that every class gets
# its own colour (looked up by class index) and its own legend label.
for i, j in enumerate(np.unique(y_set)):
    in_class = y_set == j
    plt.scatter(X_set[in_class, 0], X_set[in_class, 1],
                color=class_colors(i), label=j)
OUTPUT:
Experiment 8
AIM: To perform linear regression on datasets.
Code:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.core.common import random_state
from sklearn.linear_model import LinearRegression
# Load the salary data set from its CSV file into a DataFrame
df_sal = pd.read_csv(filepath_or_buffer='/content/Salary_Data.csv')

# Preview the first rows (shown automatically when run in a notebook cell)
df_sal.head()

# Summary statistics of the columns
df_sal.describe()
# Data distribution of the Salary column.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay and
# density scaling is the replacement recommended by seaborn's deprecation
# notice (cut=3 keeps the KDE tails as distplot drew them), and it matches
# the histplot calls used elsewhere in this file.
plt.title('Salary Distribution Plot')
sns.histplot(df_sal['Salary'], kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()
# Scatter plot showing how Salary relates to YearsExperience
plt.scatter(
    x=df_sal['YearsExperience'],
    y=df_sal['Salary'],
    color='lightcoral',
)

# Axis captions and title
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.title('Salary vs Experience')

# Turn off the frame drawn around the axes, then display the figure
plt.box(False)
plt.show()
# Separate the columns: the first column is the independent variable,
# every remaining column is the dependent variable. Slicing (rather than
# selecting a single column) keeps both as DataFrames.
X = df_sal.iloc[:, 0:1]
y = df_sal.iloc[:, 1:]

# Hold out 20% of the rows as the test set; random_state=0 makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Create the linear-regression model and fit it to the training data
# (fit returns the estimator itself, so the two steps can be chained)
regressor = LinearRegression().fit(X_train, y_train)

# Predictions for both partitions
y_pred_train = regressor.predict(X_train)  # fitted values for the training rows
y_pred_test = regressor.predict(X_test)    # predicted values for the held-out rows
OUTPUT:
Experiment 9
AIM: To perform data aggregation and group-wise operations.
Code:
import pandas as pd
# Sample sales records: one row per (country, year) pair
data = dict(
    Country=['India', 'India', 'USA', 'USA', 'Germany', 'Germany'],
    Year=[2020, 2021, 2020, 2021, 2020, 2021],
    Sales=[250, 300, 200, 210, 150, 180],
)

# Turn the column-oriented dictionary into a DataFrame and display it
df = pd.DataFrame(data)
print(df)
OUTPUT: