Experiment 5
Use the UCI diabetes data set and the Pima Indians Diabetes data set to perform the
following:
a. Univariate analysis: Frequency, Mean, Median, Mode, Variance, Standard Deviation,
Skewness and Kurtosis.
b. Bivariate analysis: Linear and logistic regression modelling.
c. Multiple regression analysis.
d. Compare the results of the above analyses for the two data sets.
Code:
a) Code:
import pandas as pd
import scipy.stats as sp
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import warnings as warn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report
warn.filterwarnings("ignore")
# Load the Pima Indians Diabetes data set
df = pd.read_csv('diabetes.csv')
# Univariate analysis
print("Frequency:\n", df.value_counts())
print("Mean:\n", df.mean())
print("Median:\n", df.median())
print("Mode:\n", df.mode())
print("Variance:\n", df.var())
print("Standard deviation:\n", df.std())
print("Skewness:\n", sp.skew(df))
print("Kurtosis:\n", sp.kurtosis(df))
Output:
b) Code:
# Linear regression: predict BloodPressure from Age
# Load the dataset
data = pd.read_csv('diabetes.csv')
# Select independent (X) and dependent (y) variables
X = data[['Age']]
y = data['BloodPressure']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a linear regression model
model = LinearRegression()
# Fit the model to the training data
model.fit(X_train, y_train)
# Make predictions on the test data
predictions = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print('Mean Squared Error:', mse)
# Plot the regression line
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, predictions, color='blue', linewidth=3)
plt.xlabel('Age')
plt.ylabel('BloodPressure')
plt.title('Linear Regression')
plt.show()
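The fitted line itself is worth reporting alongside the MSE; a short optional addition using the model object fitted above:
# Report the fitted slope, intercept and R^2 on the test set
print('Slope (coefficient for Age):', model.coef_[0])
print('Intercept:', model.intercept_)
print('R^2 on test data:', model.score(X_test, y_test))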
# Logistic regression: predict the binary Outcome from Age
# Load the dataset
data = pd.read_csv('diabetes.csv')
# Select independent (X) and dependent (y) variables
# (Outcome is the 0/1 diabetes label, a suitable target for logistic regression)
X = data[['Age']]
y = data['Outcome']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a logistic regression model
model = LogisticRegression()
# Fit the model to the training data
model.fit(X_train, y_train)
# Make predictions on the test data
predictions = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)
# Create a confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()
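The classification_report imported in part a is otherwise unused; it rounds out the evaluation with per-class precision, recall and F1 (an optional addition using the predictions computed above):
# Precision, recall and F1-score for each class
print(classification_report(y_test, predictions))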
Output:
c) Code:
# Multiple regression: model BloodPressure using several predictors
# (Age, BMI and Glucose are chosen here for illustration)
X_multi = data[['Age', 'BMI', 'Glucose']]
X_multi = sm.add_constant(X_multi)
multi_model = sm.OLS(data['BloodPressure'], X_multi).fit()
# Display multiple regression results
print("\nMultiple Regression Results:")
print(multi_model.summary())
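Key quantities can also be pulled out of the statsmodels results object for the comparison in part d; a brief optional sketch:
# Extract headline figures from the fitted multiple regression
print('R-squared:', multi_model.rsquared)
print('Adjusted R-squared:', multi_model.rsquared_adj)
print('Coefficients:\n', multi_model.params)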
Output:
d) Code:
# Load the second (UCI) diabetes data set and compare summary statistics
data2 = pd.read_csv('d2.csv')
print("Comparison of the two data sets")
print("diabetes.csv summary statistics:")
print(data.describe())
print("d2.csv summary statistics:")
print(data2.describe())
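To set the part a results side by side more directly, the skewness and kurtosis of the columns the two files share can be tabulated together (a minimal sketch, assuming the column names of d2.csv overlap with those of diabetes.csv):
# Side-by-side skewness and kurtosis for the columns common to both files
common_cols = data.columns.intersection(data2.columns)
comparison = pd.DataFrame({
    'skew_diabetes': data[common_cols].skew(),
    'skew_d2': data2[common_cols].skew(),
    'kurt_diabetes': data[common_cols].kurt(),
    'kurt_d2': data2[common_cols].kurt(),
})
print(comparison)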
Output: