Slip Clustering
# imports used throughout the slip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
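# normalize data
# NOTE: scaled_df is used below but never built in the slip; a minimal sketch,
# assuming df is the already loaded DataFrame of numeric features.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)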
cluster_range = range(1, 10)
cluster_errors = []
# elbow method to find the best number of clusters
for num_clusters in cluster_range:
    clusters = KMeans(n_clusters=num_clusters, n_init="auto")
    clusters.fit(scaled_df)
    cluster_errors.append(clusters.inertia_)

plt.plot(cluster_range, cluster_errors, marker="o")
plt.xlabel("number of clusters (k)")
plt.ylabel("inertia (within-cluster sum of squares)")
plt.show()
# choose k at the elbow (bend) of the plot
k = 3
clusters = KMeans(n_clusters=k, random_state=42, n_init="auto")
clusters.fit(scaled_df)
df["clusterid"] = clusters.labels_
# clusters.cluster_centers_  -> centroid coordinates in the scaled feature space
# clusters.labels_           -> cluster assignment of each row
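# NOTE (addition, not in the original slip): quick ways to inspect the clusters,
# assuming the scaler and df built above.
print(df["clusterid"].value_counts())                    # cluster sizes
print(df.groupby("clusterid").mean(numeric_only=True))   # per-cluster feature means

# centroids mapped back from standardized space to the original units
centers_original = scaler.inverse_transform(clusters.cluster_centers_)
print(pd.DataFrame(centers_original, columns=scaled_df.columns))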
# ROC Curve
from sklearn import metrics

def draw_roc_curve(model, test_X, test_y):
    test_results_df = pd.DataFrame({'actual': test_y})
    test_results_df = test_results_df.reset_index()
    # probability of the positive class (column 1 of predict_proba)
    predict_proba_df = pd.DataFrame(model.predict_proba(test_X.values))
    test_results_df['chd_1'] = predict_proba_df.iloc[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(
        test_results_df.actual, test_results_df.chd_1, drop_intermediate=False)
    auc_score = metrics.roc_auc_score(test_results_df.actual,
                                      test_results_df.chd_1)
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()
    return auc_score
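# NOTE (addition, not in the original slip): one possible way to call the helper;
# X, y and the logistic-regression model are assumptions for illustration only.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
logit = LogisticRegression(max_iter=1000).fit(train_X, train_y)
draw_roc_curve(logit, test_X, test_y)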
# PCA (from scratch via the covariance matrix)
from sklearn.preprocessing import StandardScaler
from numpy.linalg import eig

# standardize the features (feat is assumed to hold the feature names)
X_norm = StandardScaler().fit_transform(X)
X_norm = pd.DataFrame(X_norm, columns=feat)

# mean-centre (already ~0 after standardization, kept for clarity)
X_mean_adj = X_norm - X_norm.mean()
cov_mat = X_mean_adj.cov()

# eigen-decomposition of the covariance matrix
val, vec = eig(cov_mat)

# sort eigenvalues (and matching eigenvectors) in descending order
sorted_idx = np.argsort(-val)
val = val[sorted_idx]
vec = vec[:, sorted_idx]

# scores on the first principal component
pc1 = np.dot(vec[:, 0], X_mean_adj.T)
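# NOTE (addition, not in the original slip): sanity check of the manual PCA
# against sklearn.decomposition.PCA; component signs may differ, since the
# sign of an eigenvector is arbitrary.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
scores = pca.fit_transform(X_norm)

print(pca.explained_variance_ratio_)    # should match val[:2] / val.sum()
print(val[:2] / val.sum())
print(np.allclose(np.abs(pc1), np.abs(scores[:, 0])))  # PC1 scores agree up to sign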