"""K-means clustering walkthrough with silhouette diagnostics.

Reads ``gre_ex.csv`` (semicolon-separated) and then:

1. scatter-plots the first two columns of the data,
2. fits K-means with ``N_CLUSTERS`` clusters on all columns and plots the
   labelled points together with the cluster centers,
3. prints the mean silhouette score and draws a per-sample silhouette plot,
4. plots the mean silhouette score for each candidate number of clusters.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

DATA_PATH = "gre_ex.csv"
N_CLUSTERS = 4
# Fixed seed so the labels, scores and figures are identical between runs.
RANDOM_STATE = 0
# Vertical gap (in sample rows) between cluster bands in the silhouette plot.
BAND_GAP = 10

# Read data
kdata = pd.read_csv(DATA_PATH, sep=';')
x_name, y_name = kdata.columns.values[0], kdata.columns.values[1]

# Plot the data (first two columns only)
fig, ax = plt.subplots()
ax.scatter(kdata.iloc[:, 0], kdata.iloc[:, 1])
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)

# Apply kmeans (fitted on every column of kdata, not just the plotted two)
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
labels = kmeans.fit_predict(kdata)
centers = kmeans.cluster_centers_

# Plot clustering result: points colored by label, centers as black crosses
fig, ax = plt.subplots()
ax.scatter(kdata.iloc[:, 0], kdata.iloc[:, 1], c=labels)
ax.scatter(centers[:, 0], centers[:, 1], c='black', marker='+', s=100)
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)

# Silhouette: mean score over all samples
silhouette = silhouette_score(kdata, labels)
print(silhouette)

# Silhouette plot: one horizontal band per cluster, samples sorted by score
silhouette_values = silhouette_samples(kdata, labels)
fig, ax = plt.subplots()
y_lower = BAND_GAP
for i in range(N_CLUSTERS):
    # np.sort returns a sorted copy, leaving silhouette_values untouched.
    ith_cluster_silhouette_values = np.sort(silhouette_values[labels == i])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0,
                     ith_cluster_silhouette_values)
    # Cluster index, placed just left of zero at the band's vertical middle.
    ax.text(-0.025, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + BAND_GAP
# Dashed red line marks the mean silhouette score across all samples.
ax.axvline(x=silhouette, color="red", linestyle="--")
ax.set_yticks([])

# Select number of clusters: mean silhouette score per candidate count.
# Separate names are used here so the N_CLUSTERS fit above (kmeans, labels,
# centers) is not overwritten by the sweep.
fig, ax = plt.subplots()
cluster_nums = [2, 3, 4, 5, 6, 7, 8]
silhouettes = []
for n_clusters in cluster_nums:
    candidate = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
    candidate_labels = candidate.fit_predict(kdata)
    silhouettes.append(silhouette_score(kdata, candidate_labels))
ax.plot(cluster_nums, silhouettes)
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Silhouette")

plt.show()