"""
DS2500
Spring 2025
Sample code from lecture 4/1/25

Next week! - trying some scikit learn on top of it

K-Means Clustering
Today - writing kmeans by hand

Overview/big picture:
    - read in data, create a dataframe
    - pick k rows at random to be centroids
    - compute distance from every object to every centroid
    - assign each object to its closest centroid
    - re-compute centroids (mean of each cluster) and go again

centroid = slimy/kill human values (one row)
cluster = 0, 1, 2, ..., k-1
"""

import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns

K = 5
NUMERIC = ["slimy", "kill humans"]
ALIEN_FILE = "aliens.json"

# upper bound on assign/re-compute rounds in main()
MAX_ITERATIONS = 5


def select_centroids(df, k=K, random_state=None):
    """
    Randomly choose k rows of the training dataframe to act as the
    initial centroids.

    Parameters:
        df - dataframe of training data
        k - number of centroids to pick
        random_state - optional seed forwarded to DataFrame.sample so a
            run can be reproduced; None keeps the draw random
    Returns: a dataframe holding the k sampled rows (original index kept)
    """
    return df.sample(k, random_state=random_state)


def find_closest_centroid(object, centroids):
    """
    Compute the euclidean distance from one object to every centroid and
    return the position of the closest one (i.e. its cluster).

    Parameters:
        object - a single object (series) of feature values; the name
            shadows the builtin but is kept so keyword callers still work
        centroids - dataframe with one row per centroid
    Returns: position (0-based) of the nearest centroid; on a tie the
        first minimum wins
    """
    distances = cdist(centroids.values, object.values.reshape(1, -1),
                      metric="euclidean")
    return np.argmin(distances)


def assign_to_cluster(df, centroids):
    """
    Find the closest centroid to each object and assign the object to
    that cluster.

    Parameters:
        df - dataframe of objects (feature columns only)
        centroids - dataframe of centroids with the same feature columns
    Returns: a list with one cluster label per row of df, each label
        being a centroid position in 0, 1, 2, ..., k-1
    """
    # one objects-by-centroids distance matrix instead of a cdist call
    # per row; argmin along each row picks the first closest centroid
    distances = cdist(df.values, centroids.values, metric="euclidean")
    return distances.argmin(axis=1).tolist()


def plot_clusters(df, x, y, cluster="cluster"):
    """
    Draw a scatterplot of column x against column y, colored by the
    cluster column, and show it.

    Parameters:
        df - dataframe containing the x, y and cluster columns
        x, y - names of the columns to plot
        cluster - name of the column used for the point color
    Returns: nothing
    """
    sns.scatterplot(x=x, y=y, data=df, hue=cluster, palette="flare", s=100)
    plt.show()


def recompute_centroids(df, numeric=NUMERIC, cluster="cluster",
                        previous=None):
    """
    Re-compute the centroids as the mean of each cluster's features.

    Parameters:
        df - dataframe holding the feature columns and a cluster column
        numeric - list of feature column names
        cluster - name of the cluster column
        previous - optional dataframe of the centroids used for the
            current assignment (row position = cluster label). When
            given, a cluster with no members keeps its previous centroid;
            without it such a cluster is dropped and the remaining
            centroids are renumbered.
    Returns: a dataframe with one row per centroid, columns = numeric,
        indexed 0, 1, 2, ...
    """
    means = df.groupby(cluster)[numeric].mean()
    if previous is not None:
        # line the old centroids up with the labels 0..k-1, then fill
        # the rows groupby left out (clusters that ended up empty)
        fallback = previous[numeric].reset_index(drop=True)
        means = means.reindex(fallback.index).fillna(fallback)
    return means.reset_index(drop=True)


def main():
    # step one: gather data by reading from json file into dataframe
    df = pd.read_json(ALIEN_FILE)
    df = pd.json_normalize(df["results"])
    print(df.columns)

    # get our randomly chosen centroids
    centroids = select_centroids(df, K)
    print(centroids["name"])
    centroids = centroids[NUMERIC]
    print(centroids)

    # keep going until the clusters stabilize, but never more than
    # MAX_ITERATIONS rounds (plenty for a small dataset)
    previous_clusters = None
    for _ in range(MAX_ITERATIONS):
        # assign objects to clusters
        clusters = assign_to_cluster(df[NUMERIC], centroids)
        if clusters == previous_clusters:
            # nothing moved, so the centroids would not change either
            break
        previous_clusters = clusters
        df["cluster"] = clusters
        print(df.head(5))

        # plot what we have so far...
        plot_clusters(df, NUMERIC[0], NUMERIC[1], "cluster")

        # re-compute centroids
        centroids = recompute_centroids(df, previous=centroids)

    # what clusters do aliens get assigned to
    # in the end?
    print(df)


if __name__ == "__main__":
    main()