Experiment 5
Experiment 5
Experiment 5
Use the diabetes data set from UCI and the Pima Indians Diabetes data set to perform the following:
a. Univariate analysis: Frequency, Mean, Median, Mode, Variance, Standard Deviation,
Skewness and Kurtosis.
b. Bivariate analysis: Linear and logistic regression modeling.
c. Multiple Regression analysis.
d. Also compare the results of the above analysis for the two data sets
a) Code:
import pandas as pd
import warnings as warn

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sp
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Univariate analysis: print the descriptive statistics of the DataFrame `df`.
# NOTE(review): `df` is not defined anywhere in the visible code - presumably
# it is the diabetes DataFrame loaded beforehand; confirm before running.
print("Frequency:\n", df.value_counts())
# The pandas statistics are methods: they must be *called*, otherwise print()
# only shows the bound-method representation instead of the numbers.
print("Mean:\n", df.mean())
print("Median:\n", df.median())
print("Mode:\n", df.mode())
print("Variance:\n", df.var())
print("Standard deviation:\n", df.std())
# scipy computes one value per column (axis=0 by default).
print("Skewness:\n", sp.skew(df))
print("Kurtosis:\n", sp.kurtosis(df))
b) Code:
# Linear regression: predict BloodPressure from Age.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load your dataset
data = pd.read_csv('diabetes.csv')

# Select independent (X) and dependent (y) variables
X = data[['Age']]
y = data['BloodPressure']

# Split the data into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a linear regression model and fit it to the training data
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test data
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print('Mean Squared Error:\n', mse)

# Plot the test points and the fitted regression line
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, predictions, color='blue', linewidth=3)
plt.xlabel('Independent Variable')
plt.ylabel('Dependent Variable')
plt.title('Linear Regression')
# Render and finish this figure so that later plots start on fresh axes
# instead of being drawn on top of the regression plot.
plt.show()
# Logistic regression: classify the diabetes outcome from Age.
# Load your dataset
data = pd.read_csv('diabetes.csv')

# Select independent (X) and dependent variables.
# Logistic regression needs a categorical target; BloodPressure is a
# continuous measurement and would be treated as dozens of separate classes.
# NOTE(review): assumes the CSV carries the binary 'Outcome' column of the
# Pima Indians Diabetes data set - confirm against the actual file.
X = data[['Age']]
outcome = data['Outcome']

# Split the data into training and testing sets. The class labels get their
# own names so the y_train / y_test produced by the linear-regression section
# are not overwritten.
X_train, X_test, outcome_train, outcome_test = train_test_split(
    X, outcome, test_size=0.2, random_state=42
)

# Create a logistic regression model and fit it to the training data
model = LogisticRegression()
model.fit(X_train, outcome_train)

# Make predictions on the test data
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(outcome_test, predictions)
print('Accuracy: ', accuracy)

# Create a confusion matrix and draw it as an annotated heatmap.
conf_matrix = confusion_matrix(outcome_test, predictions)
# 'gray' is the long-standing matplotlib colormap name ('grey' is only an
# alias in recent releases). The original call was cut off after `cbar =`;
# NOTE(review): cbar=False is assumed - confirm the intended value.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='gray', cbar=False)
plt.title('Confusion Matrix - Logistic Regression')
plt.show()
c) Code:
# Multiple regression: explain BloodPressure with every other numeric column
# of `data` as a predictor (a single predictor would just repeat the simple
# linear regression), fitted on the full data set rather than the small
# test split.
target_column = 'BloodPressure'
predictors = data.select_dtypes(include='number').drop(columns=[target_column])
# statsmodels does not add an intercept automatically, hence add_constant.
X_multi = sm.add_constant(predictors)
# missing='drop' skips rows containing NaN instead of raising an error.
multi_model = sm.OLS(data[target_column], X_multi, missing='drop').fit()
print("\n3. )Multiple Regression: ")
# Display multiple regression results
print("\nMultiple Regression Results:")
print(multi_model.summary())
d) Code:
# Compare the two data sets through their summary statistics.
# DataFrame.compare() only accepts identically-labelled frames (same index
# and columns) and raises ValueError otherwise, so it cannot be used on two
# different data sets; describe() tables placed side by side work for any
# pair of numeric frames.
# NOTE(review): `data2` is not defined in the visible code - it is presumably
# the second (UCI) diabetes DataFrame; load it before this section.
print("comparision of two datasets ")
result = pd.concat(
    [data.describe(), data2.describe()],
    axis=1,
    keys=['data', 'data2'],
)
print(result)