import numpy as np
# From a list
list_data = [1, 2, 3, 4, 5]
array_from_list = np.array(list_data)
print("Array from list:", array_from_list)
# From a tuple
tuple_data = (6, 7, 8, 9, 10)
array_from_tuple = np.array(tuple_data)
print("Array from tuple:", array_from_tuple)
# Multi-dimensional list
multi_list_data = [[1, 2, 3], [4, 5, 6]]
multi_array = np.array(multi_list_data)
print("Multi-dimensional array:", multi_array)
OUTPUT:-
import numpy as np
# Array of zeros
zeros_array = np.zeros((3, 3))
print("Array of zeros:\n", zeros_array)
# Array of ones
ones_array = np.ones((2, 4))
print("Array of ones:\n", ones_array)
# Identity matrix
identity_matrix = np.eye(3)
print("Identity matrix:\n", identity_matrix)
OUTPUT:-
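Besides these constructors, NumPy also provides range-based creation routines. A small supplementary sketch (the sample values are arbitrary):
import numpy as np
# Evenly spaced values over the half-open interval [start, stop)
range_array = np.arange(0, 10, 2)
print("Array from arange:", range_array)
# A fixed number of evenly spaced points between two endpoints (inclusive)
lin_array = np.linspace(0, 1, 5)
print("Array from linspace:", lin_array)
# An array filled with a constant value
full_array = np.full((2, 3), 7)
print("Array of sevens:\n", full_array)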
import numpy as np
# Normal distribution (mean=0, std=1)
normal_dist_array = np.random.randn(3, 3)
print("Array from normal distribution:\n", normal_dist_array)
OUTPUT:-
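The legacy np.random functions above still work; NumPy's newer Generator API makes the same draws with explicit seeding. A brief sketch (the seed value 42 is arbitrary):
import numpy as np
rng = np.random.default_rng(42)  # seeded generator for reproducible results
print("Standard normal:\n", rng.standard_normal((3, 3)))
print("Uniform [0, 1):\n", rng.random((2, 2)))
print("Random integers 0-9:", rng.integers(0, 10, size=5))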
import numpy as np
# 1D array indexing
array_1d = np.array([10, 20, 30, 40, 50])
print("Element at index 2:", array_1d[2])
# 2D array indexing
array_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print("Element at row 1, column 2:", array_2d[1, 2])
arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
print(arr[0, 1, 2])  # third element of the second row of the first matrix: 6
arr = np.array([[1,2,3,4,5], [6,7,8,9,10]])
print('Last element from 2nd dim: ', arr[1, -1])
OUTPUT:-
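Indexing also extends to boolean masks and lists of positions ("fancy" indexing). A short sketch with arbitrary sample values:
import numpy as np
arr = np.array([10, 20, 30, 40, 50])
# Boolean indexing: keep elements that satisfy a condition
print(arr[arr > 25])
# Fancy indexing: select elements by a list of positions
print(arr[[0, 2, 4]])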
Slicing
Slicing allows you to access a subset of an array using the syntax start:stop:step.
If we don't pass start, it is considered 0.
If we don't pass stop, it is considered the length of the array in that dimension.
If we don't pass step, it is considered 1.
Code:-
import numpy as np
arr = np.array([1, 2, 3, 4, 5, 6, 7])
print(arr[1:5])
print(arr[:4])
print(arr[-3:-1])
print(arr[1:5:2])
print(arr[::2])
arr = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
print(arr[1, 1:4])  # columns 1-3 of the second row
OUTPUT:-
import numpy as np
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
newarr = arr.reshape(4, 3)  # reshape the 1-D array into 4 rows of 3 columns
print(newarr)
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8])
print(arr.reshape(2, 4).base)  # .base returns the original array, so reshape gives a view
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8])
newarr = arr.reshape(2, 2, -1)  # -1 tells NumPy to infer the remaining dimension
print(newarr)
arr = np.array([[1, 2, 3], [4, 5, 6]])
newarr = arr.reshape(-1)  # reshape(-1) flattens the array to 1-D
print(newarr)
OUTPUT:-
import numpy as np
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])
arr = np.concatenate((arr1, arr2), axis=1)  # join the two 2-D arrays along columns
print(arr)
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
arr = np.stack((arr1, arr2), axis=1)  # stack along a new axis
print(arr)
arr = np.hstack((arr1, arr2))  # stack horizontally (along rows)
print(arr)
arr = np.vstack((arr1, arr2))  # stack vertically (along columns)
print(arr)
arr = np.dstack((arr1, arr2))  # stack along depth (the third axis)
print(arr)
OUTPUT:-
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
newarr = np.hsplit(arr, 3)  # split the array into 3 pieces along columns
print(newarr)
OUTPUT:-
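Note that np.hsplit requires the split to be exact. When the array does not divide evenly, np.array_split can be used instead; a small sketch:
import numpy as np
arr = np.array([1, 2, 3, 4, 5, 6, 7])
# split() would raise here; array_split() tolerates the remainder
pieces = np.array_split(arr, 3)
print(pieces)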
import numpy as np
array = np.array([1, 2, 3, 4])  # assumed sample values
angles = np.array([0, np.pi / 2, np.pi])  # assumed sample angles in radians
# Sine of each element
print("Sine:", np.sin(angles))
# Cosine of each element
print("Cosine:", np.cos(angles))
# Tangent of each element
print("Tangent:", np.tan(angles))
# Exponential (e^x) of each element
print("Exponential:", np.exp(array))
# Natural logarithm (log base e)
print("Natural Logarithm:",np.log(array))
# Logarithm base 10
print("Logarithm base 10:",np.log10(array))
OUTPUT:-
import numpy as np
array = np.array([1, 2, 3, 4])  # assumed sample values
# Sum of elements
print("Sum:", np.sum(array))
# Mean of elements
print("Mean:", np.mean(array))
# Median of elements
print("Median:", np.median(array))
# Standard deviation of elements
print("Standard Deviation:", np.std(array))
# Minimum and Maximum
print("Min:", np.min(array))
print("Max:", np.max(array))
OUTPUT:-
3. Matrix Operations
For 2D arrays (matrices), NumPy provides matrix-specific operations such as dot products,
matrix multiplication, and transposing.
Code:-
import numpy as np
# Matrix multiplication
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])
product = np.dot(matrix_a, matrix_b)
print("Matrix multiplication:\n", product)
# Element-wise multiplication
elementwise_product = matrix_a * matrix_b
print("Element-wise multiplication:\n", elementwise_product)
# Transpose of a matrix
transpose = matrix_a.T
print("Transpose:\n", transpose)
OUTPUT:-
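For 2-D arrays, the @ operator (Python 3.5+) is an equivalent, more readable spelling of np.dot. A quick check using the same matrices:
import numpy as np
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])
# a @ b is equivalent to np.dot(a, b) for 2-D arrays
print(np.array_equal(matrix_a @ matrix_b, np.dot(matrix_a, matrix_b)))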
4. Conditional Operations
You can also apply conditional functions on arrays to filter data or perform conditional
computations.
Code:-
import numpy as np
# Conditional selection
array = np.array([1, 2, 3, 4, 5])
print("Elements greater than 2:", array[array > 2])
result = np.where(array % 2 == 0, "Even", "Odd")
print("Even or Odd:", result)
OUTPUT:-
5. Broadcasting
NumPy's broadcasting allows operations on arrays of different shapes, which is efficient for
element-wise operations.
Code:-
import numpy as np
# Broadcasting with scalar
array = np.array([[1, 2, 3], [4, 5, 6]])
print("Add 10 to each element:\n", array + 10)
# Broadcasting with another array
array_b = np.array([10, 20, 30])
print("Element-wise addition with broadcasting:\n", array + array_b)
OUTPUT:-
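Broadcasting also combines arrays of different dimensionality, for example a column vector with a row vector; a short sketch with arbitrary values:
import numpy as np
col = np.array([[1], [2], [3]])  # shape (3, 1)
row = np.array([10, 20, 30])     # shape (3,)
# Shapes (3, 1) and (3,) broadcast together to shape (3, 3)
print(col + row)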
Sum
Code:-
import numpy as np
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])  # assumed sample data
# Sum of each column
column_sum = np.sum(data, axis=0)
print("Sum of each column:", column_sum)
# Sum of each row
row_sum = np.sum(data, axis=1)
print("Sum of each row:", row_sum)
OUTPUT:-
Standard Deviation
Code:-
# Standard deviation of each column
column_std = np.std(data, axis=0)
print("Standard deviation of each column:", column_std)
# Standard deviation of each row
row_std = np.std(data, axis=1)
print("Standard deviation of each row:", row_std)
OUTPUT:-
Minimum
Code:-
# Min of each column
column_min = np.min(data, axis=0)
print("Min of each column:", column_min)
5) Load an image file and perform crop and flip operations using NumPy indexing.
A:-
Code:-
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
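A minimal sketch of the crop and flip steps, assuming an image file named image.jpg is available in the working directory (the crop bounds are arbitrary):
# Load the image as a NumPy array
img = np.array(Image.open("image.jpg"))
print("Image shape:", img.shape)  # (height, width, channels)
# Crop: slice rows and columns (here the central region)
h, w = img.shape[0], img.shape[1]
cropped = img[h // 4 : 3 * h // 4, w // 4 : 3 * w // 4]
# Flip: reverse rows (vertical flip) or columns (horizontal flip)
flipped_v = img[::-1, :]
flipped_h = img[:, ::-1]
# Display the original, cropped, and flipped versions side by side
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, im, title in zip(axes, [img, cropped, flipped_h],
                         ["Original", "Cropped", "Horizontally flipped"]):
    ax.imshow(im)
    ax.set_title(title)
    ax.axis("off")
plt.show()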
WEEK-3,4,5
1)Create Pandas Series and Data Frame from various inputs.
A:- Pandas is a Python library used for working with data sets.
It has functions for analyzing, cleaning, exploring, and manipulating data.
The name "Pandas" has a reference to both "Panel Data", and "Python Data Analysis" and
was created by Wes McKinney in 2008.
Pandas Series
A Pandas Series is like a column in a table.
It is a one-dimensional array holding data of any type.
Code:-
import pandas as pd
a = [1, 7, 2]
myvar = pd.Series(a)
print(myvar)
print(myvar[0])
a = [1, 7, 2]
myvar = pd.Series(a, index = ["x", "y", "z"])
print(myvar)
calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(calories)
print(myvar)
OUTPUT:-
Pandas DataFrames
Data sets in Pandas are usually multi-dimensional tables, called DataFrames.
While a Series is like a column, a DataFrame is the whole table.
A Pandas DataFrame is a two-dimensional data structure, like a 2-D array or a table
with rows and columns.
Code:-
import pandas as pd
data = {
"calories": [420, 380, 390],
"duration": [50, 40, 45]
}
#load data into a DataFrame object:
df = pd.DataFrame(data)
print(df)
print(df.loc[0])
print(df.loc[[0, 1]])
df = pd.DataFrame(data, index = ["day1", "day2", "day3"])
print(df)
print(df.loc["day2"])
OUTPUT:-
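A DataFrame can also be built from other inputs, such as a list of dictionaries or a dictionary of Series; a brief sketch:
import pandas as pd
# From a list of dictionaries: each dict becomes one row
rows = [{"calories": 420, "duration": 50}, {"calories": 380, "duration": 40}]
print(pd.DataFrame(rows))
# From a dictionary of Series: each Series becomes one column
cols = {"calories": pd.Series([420, 380]), "duration": pd.Series([50, 40])}
print(pd.DataFrame(cols))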
2. Import any CSV file to a Pandas DataFrame and perform the following operations.
A:- Let’s assume you have a CSV file named "data.csv".
Code:-
import pandas as pd
# Import CSV file
df = pd.read_csv("/content/data.csv")
(a) Visualize the first and last 10 records
Code:-
# Display first 10 records
print("First 10 records:\n", df.head(10))
# Index of DataFrame
print("Index:", df.index)
# Column names
print("Columns:", df.columns)
OUTPUT:-
2. Deleting Columns
Let's say we want to delete the Weight column.
Code:-
# Drop the "Weight" column
df_without_weight = df.drop(columns=["Weight"])
df_without_weight.head()
OUTPUT:-
Code:-
# Summary statistics for the "Volume" column (definition assumed to mirror the CO2 block)
volume_stats = {
    "Mean Volume": df["Volume"].mean(),
    "Sum Volume": df["Volume"].sum(),
    "Std Dev Volume": df["Volume"].std(),
    "Min Volume": df["Volume"].min(),
    "Max Volume": df["Volume"].max(),
}
co2_stats = {
    "Mean CO2": df["CO2"].mean(),
    "Sum CO2": df["CO2"].sum(),
    "Std Dev CO2": df["CO2"].std(),
    "Min CO2": df["CO2"].min(),
    "Max CO2": df["CO2"].max(),
}
volume_stats, co2_stats
OUTPUT:-
(f) Find the count and uniqueness of the given categorical values.
Assuming Car is a categorical column, we can find the count and uniqueness of car brands.
Code:-
# Count of unique car brands
unique_car_counts = df["Car"].value_counts()
unique_car_counts
# Number of unique car brands
unique_car_count = df["Car"].nunique()
unique_car_count
OUTPUT:-
WEEK 6, 7, 8
1. Develop a simple linear regression model and perform residual analysis.
Residual analysis is an important step in evaluating the fit of a regression model. It involves examining
the residuals, which are the differences between the observed values and the values predicted by the
regression model. The following walkthrough performs residual analysis on a built-in dataset, the
well-known "Iris" dataset, with a focus on a simple linear regression model.
Source code:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
# Load the Iris dataset
iris = load_iris()
data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
data['species'] = iris.target
# Fit the model (predicting sepal length from sepal width)
X = data['sepal width (cm)']
Y = data['sepal length (cm)']
X = sm.add_constant(X) # Adds an intercept term
model = sm.OLS(Y, X).fit()
# Display the model summary
print(model.summary())
# Calculate predicted values and residuals
data['predicted'] = model.predict(X)
data['residuals'] = data['sepal length (cm)'] - data['predicted']
# Residual plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data['predicted'], y=data['residuals'])
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals vs Predicted Values')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()
# Check for normality: Histogram of residuals
plt.figure(figsize=(10, 6))
sns.histplot(data['residuals'], kde=True)
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()
# Q-Q plot for normality of residuals
sm.qqplot(data['residuals'], line='s')
plt.title('Q-Q Plot of Residuals')
plt.show()
# Calculate leverage
leverage = model.get_influence().hat_matrix_diag
# Leverage vs Residuals plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x=leverage, y=data['residuals'])
plt.axhline(0, color='red', linestyle='--')
plt.title('Leverage vs Residuals')
plt.xlabel('Leverage')
plt.ylabel('Residuals')
plt.show()
Output:
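Alongside the plots, the residual assumptions can also be checked numerically. A short sketch using tests from statsmodels and SciPy, reusing the residuals computed above:
from statsmodels.stats.stattools import durbin_watson
from scipy import stats
# Durbin-Watson statistic near 2 suggests uncorrelated residuals
print("Durbin-Watson:", durbin_watson(data['residuals']))
# Shapiro-Wilk test: a small p-value suggests non-normal residuals
stat, p_value = stats.shapiro(data['residuals'])
print(f"Shapiro-Wilk: statistic={stat:.3f}, p-value={p_value:.3f}")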
2. Residual plots of linear regression
Residual Plots: Definition
Residual Plots are graphical representations used to diagnose the fit of a regression model by
displaying the residuals (the differences between observed and predicted values) against
predicted values or another variable. They help assess whether the assumptions of regression
analysis (linearity, homoscedasticity, independence, and normality of residuals) are met.
Key Components of Residual Plots
Residuals: The vertical distances between the observed data points and the regression line,
calculated as: residual = observed value − predicted value, i.e., eᵢ = yᵢ − ŷᵢ.
Horizontal Line at Zero: Indicates where the residuals are equal to zero, which is the ideal
scenario for a perfect model fit.
Axes:
X-axis: Often shows predicted values or one of the independent variables.
Y-axis: Displays residuals.
Purpose of Residual Plots
Identify Patterns: To detect non-linearity, if residuals show a systematic pattern rather than
random scatter.
Check Homoscedasticity: To assess if residuals have constant variance. A funnel shape indicates
heteroscedasticity.
Detect Outliers and Influential Points: Outliers can significantly affect the regression model.
Source code:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset
california_housing = fetch_california_housing()
data = pd.DataFrame(data=california_housing.data, columns=california_housing.feature_names)
data['median_house_value'] = california_housing.target
# Select features and target variable
X = data['MedInc'] # Median income (feature)
Y = data['median_house_value']
# Add a constant term for the intercept
X = sm.add_constant(X)
# Fit the linear regression model
model = sm.OLS(Y, X).fit()
# Display the model summary
print(model.summary())
# Calculate predicted values and residuals
data['predicted'] = model.predict(X)
data['residuals'] = data['median_house_value'] - data['predicted']
# Residual plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data['predicted'], y=data['residuals'])
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals vs Predicted Values')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()
# Check for normality: Histogram of residuals
plt.figure(figsize=(10, 6))
sns.histplot(data['residuals'], kde=True)
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()
# Q-Q plot for normality of residuals
sm.qqplot(data['residuals'], line='s')
plt.title('Q-Q Plot of Residuals')
plt.show()
# Calculate leverage
leverage = model.get_influence().hat_matrix_diag
# Leverage vs Residuals plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x=leverage, y=data['residuals'])
plt.axhline(0, color='red', linestyle='--')
plt.title('Leverage vs Residuals')
plt.xlabel('Leverage')
plt.ylabel('Residuals')
plt.show()
Output:
3. Normal probability plots.
A normal probability plot (also known as a Q-Q plot) is a graphical tool used to assess whether a
set of data follows a normal distribution. In the context of residual analysis in regression, it's
particularly useful for checking the normality of residuals.
Source code:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset
california_housing = fetch_california_housing()
data = pd.DataFrame(data=california_housing.data, columns=california_housing.feature_names)
data['median_house_value'] = california_housing.target
# Select features and target variable
X = data['MedInc'] # Median income
Y = data['median_house_value']
# Add a constant term for the intercept
X = sm.add_constant(X)
# Fit the linear regression model
model = sm.OLS(Y, X).fit()
# Calculate residuals
data['residuals'] = Y - model.predict(X)
# Q-Q plot for normality of residuals
fig = plt.figure(figsize=(10, 6))
sm.qqplot(data['residuals'], line='s', ax=fig.add_subplot(111))  # draw into the sized figure
plt.title('Q-Q Plot of Residuals')
plt.show()
Output:
data = pd.DataFrame(data=california_housing.data, columns=california_housing.feature_names)
data['median_house_value'] = california_housing.target
# Select features and target variable
X = data[['MedInc', 'HouseAge', 'AveRooms']] # Multiple independent variables
Y = data['median_house_value']
# Add a constant term for the intercept
X = sm.add_constant(X)
# Fit the multiple linear regression model
model = sm.OLS(Y, X).fit()
# Calculate predicted values and residuals
data['predicted'] = model.predict(X)
data['residuals'] = Y - data['predicted']
# Residual plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data['predicted'], y=data['residuals'])
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals vs Predicted Values')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()
# Q-Q plot for normality of residuals
fig = plt.figure(figsize=(10, 6))
sm.qqplot(data['residuals'], line='s', ax=fig.add_subplot(111))  # draw into the sized figure
plt.title('Q-Q Plot of Residuals')
plt.show()
Output:
WEEK 9, 10, 11
1. Import any CSV file to a Pandas DataFrame and perform the following:
(a) Handle missing data by detecting and dropping/filling missing values.
Source code:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset
california_housing = fetch_california_housing()
data = pd.DataFrame(data=california_housing.data, columns=california_housing.feature_names)
data['median_house_value'] = california_housing.target
# Simulate missing data for demonstration purposes
data.loc[0:10, 'MedInc'] = np.nan # Introduce NaN values
print("Initial DataFrame:")
print(data.head())
Output:
# Fill missing values with the median
data['MedInc'] = data['MedInc'].fillna(data['MedInc'].median())
print("\nMissing values after handling:")
print(data.isnull().sum())
Output:
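Dropping rows is the other strategy mentioned in part (a); a brief sketch (dropna returns a new DataFrame by default, leaving data unchanged):
# Detect missing values per column
print(data.isnull().sum())
# Drop rows containing any NaN (subset=... limits the check to specific columns)
data_dropped = data.dropna()
print("Shape after dropping rows with NaN:", data_dropped.shape)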
# Age bins mapped to category labels (the first two bins are assumed for illustration)
age_mapping = {
    (0, 25): '0-25',
    (26, 50): '26-50',
    (51, np.inf): '51+'
}
def categorize_age(age):
    for age_range, label in age_mapping.items():
        if age_range[0] <= age <= age_range[1]:
            return label
    return 'Unknown'
# Use map() to apply the categorization
data['HouseAge_Category'] = data['HouseAge'].map(categorize_age)
# Result
print(data[['HouseAge', 'HouseAge_Category']].head())
Output:
Original dataset shape: (20640, 10)
Filtered dataset shape: (19569, 10)
Source code:
import matplotlib.pyplot as plt
import seaborn as sns
# Box plot to visualize the median house value
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['median_house_value'])
plt.title('Box plot of Median House Value')
plt.show()
Output:
# Simulate missing data for demonstration purposes
data.loc[0:10, 'MedInc'] = np.nan # Introduce NaN values
# Create a sample string column with a shorter name 's'
descriptions = [
'Affordable housing in California',
'Luxury homes with ocean view',
'Cozy cottage near the mountains',
'Modern apartments in urban area',
'Spacious villas in suburbs',
'Renovated houses with large gardens',
'New constructions with smart features',
'Historic homes with character',
'Townhouses close to amenities',
'Single-family homes with yards',
'Condos with stunning views',
'Eco-friendly houses'
]
# Repeat the descriptions to match the length of the DataFrame
data['s'] = (descriptions * (len(data) // len(descriptions) + 1))[:len(data)]
print("Initial DataFrame:")
print(data.head())
# Vectorized String Operations
data['s_lower'] = data['s'].str.lower() # Convert to lowercase
data['has_affordable'] = data['s'].str.contains('affordable', case=False) # Check for substring
data['s_length'] = data['s'].str.len() # Get length of each description
data['s_replaced'] = data['s'].str.replace('California', 'CA', regex=False) # Replace substring
data['count_e'] = data['s'].str.count('e') # Count occurrences of 'e'
# Display the modified DataFrame
print("\nModified DataFrame with String Operations:")
print(data[['s', 's_lower', 'has_affordable', 's_length', 's_replaced', 'count_e']].head())
Output:
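Splitting and regular-expression extraction follow the same vectorized pattern; a small sketch on the same 's' column:
# Split each description into a list of words
data['s_words'] = data['s'].str.split()
# Extract the first word with a regular expression (expand=False returns a Series)
data['first_word'] = data['s'].str.extract(r'^(\w+)', expand=False)
print(data[['s', 's_words', 'first_word']].head())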
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
# Load the California housing dataset
california_housing = fetch_california_housing()
data = pd.DataFrame(data=california_housing.data, columns=california_housing.feature_names)
data['median_house_value'] = california_housing.target
# Simulate missing data for demonstration purposes
data.loc[0:10, 'MedInc'] = np.nan # Introduce NaN values
# Fill missing values (simple strategy)
data['MedInc'] = data['MedInc'].fillna(data['MedInc'].mean())
# Features and target variable
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train Ridge Regression Model
ridge_model = Ridge(alpha=1.0) # Regularization strength
ridge_model.fit(X_train_scaled, y_train)
# Predict and evaluate Ridge model
ridge_predictions = ridge_model.predict(X_test_scaled)
ridge_mse = mean_squared_error(y_test, ridge_predictions)
print(f'Ridge Regression Mean Squared Error: {ridge_mse:.2f}')
# Train Lasso Regression Model
lasso_model = Lasso(alpha=1.0) # Regularization strength
lasso_model.fit(X_train_scaled, y_train)
# Predict and evaluate Lasso model
lasso_predictions = lasso_model.predict(X_test_scaled)
lasso_mse = mean_squared_error(y_test, lasso_predictions)
print(f'Lasso Regression Mean Squared Error: {lasso_mse:.2f}')
# Setting the style for the plots
sns.set(style="whitegrid")
# 1. Scatter Plot of Predictions vs. Actual Values
plt.figure(figsize=(12, 6))
# Ridge Regression Predictions
plt.subplot(1, 2, 1)
plt.scatter(y_test, ridge_predictions, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.title('Ridge Regression: Predictions vs Actual')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
# Lasso Regression Predictions
plt.subplot(1, 2, 2)
plt.scatter(y_test, lasso_predictions, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.title('Lasso Regression: Predictions vs Actual')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.tight_layout()
plt.show()
# 2. Bar Plot of Coefficients
coefficients = pd.DataFrame({
'Feature': X.columns,
'Ridge Coefficients': ridge_model.coef_,
'Lasso Coefficients': lasso_model.coef_
})
# Melt the DataFrame for better plotting
coefficients_melted = coefficients.melt(id_vars='Feature', var_name='Model',
value_name='Coefficient')
# Bar Plot
plt.figure(figsize=(12, 6))
sns.barplot(x='Feature', y='Coefficient', hue='Model', data=coefficients_melted)
plt.title('Ridge and Lasso Regression Coefficients')
plt.xticks(rotation=45)
plt.axhline(0, color='grey', linestyle='--')
plt.show()
Output:
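The regularization strength alpha need not be fixed by hand; scikit-learn's RidgeCV and LassoCV select it by cross-validation. A brief sketch reusing the scaled training data from above (the alpha grids are arbitrary):
from sklearn.linear_model import RidgeCV, LassoCV
# Cross-validated search over small grids of regularization strengths
ridge_cv = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(X_train_scaled, y_train)
lasso_cv = LassoCV(alphas=[0.001, 0.01, 0.1, 1.0], cv=5).fit(X_train_scaled, y_train)
print("Best Ridge alpha:", ridge_cv.alpha_)
print("Best Lasso alpha:", lasso_cv.alpha_)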
(e) Develop a logistic regression model on any data set for prediction.
Source code:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
# Load the California housing dataset
california_housing = fetch_california_housing()
data = pd.DataFrame(data=california_housing.data, columns=california_housing.feature_names)
data['median_house_value'] = california_housing.target
# Create a binary target variable (1 if house value > threshold, else 0)
threshold = 2.5 # You can adjust this threshold based on your needs
data['high_value'] = (data['median_house_value'] > threshold).astype(int)
# Features and target variable
X = data.drop(['median_house_value', 'high_value'], axis=1)
y = data['high_value']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the Logistic Regression Model
logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train, y_train)
# Predict on the test set
y_pred = logistic_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
xticklabels=['Low Value', 'High Value'],
yticklabels=['Low Value', 'High Value'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Output:
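Beyond hard class predictions, logistic regression also exposes class probabilities, which support a ROC curve. A short sketch using the fitted model above:
from sklearn.metrics import roc_curve, roc_auc_score
# Probability of the positive class for each test sample
y_proba = logistic_model.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, y_proba))
# Plot the ROC curve against the chance diagonal
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label='Logistic regression')
plt.plot([0, 1], [0, 1], 'r--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()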