'''
    DS2500
    Spring 2025
    Sample code from lecture 4/1/25

    Next week! - trying some scikit learn on top of it
    K-Means Clustering
    Today - writing kmeans by hand

    Overview/big picture:
    - read in data, create a dataframe
    - pick k rows at random to be centroids
    - compute distance from every object to every centroid
    - assign each object to its closest centroid
    - re-compute centroids (mean of each cluster) and go again

    centroid = slimy/kill human values (one row)
    cluster = 0, 1, 2, ..., k-1
'''

import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns

# number of clusters, and so the number of starting centroids to pick
K = 5
# feature columns used when measuring distance and averaging clusters
NUMERIC = ["slimy", "kill humans"]
# json file the alien data is read from
ALIEN_FILE = "aliens.json"

def select_centroids(df, k = K, random_state = None):
    ''' Pick the starting centroids for k-means.

        df - dataframe of training data, one row per object
        k - number of centroids to choose (defaults to K)
        random_state - optional seed, handed straight to
            DataFrame.sample. Pass an int to get the same centroids
            on every run; leave as None for a fresh random draw.

        Returns a dataframe made of k rows of df, sampled without
        replacement. The chosen rows keep all of their columns and
        their original index labels.

        Raises ValueError (from pandas) if k is bigger than the
        number of rows in df.
    '''
    return df.sample(n = k, random_state = random_state)

def find_closest_centroid(object, centroids):
    ''' Find which centroid a single object is nearest to.

        object - one object's feature values (series)
        centroids - dataframe with one row per centroid; its columns
            are assumed to line up, in order, with the values in object

        Returns the position (0, 1, 2, ...) of the centroid with the
        smallest euclidean distance to the object. That position is
        used as the cluster number. On a tie the earliest centroid wins.
    '''
    # cdist needs two 2-d arrays, so the series becomes a 1 x d matrix
    point = object.values.reshape(1, -1)
    centroid_matrix = centroids.values

    # one row per centroid, one column (for our single object)
    dist_to_each = cdist(centroid_matrix, point, metric = "euclidean")
    return np.argmin(dist_to_each)

def assign_to_cluster(df, centroids):
    ''' Assign every object to its closest centroid.

        df - dataframe of objects, holding only the feature columns
        centroids - dataframe with one row per centroid; its columns
            are assumed to be the same features, in the same order

        Returns a list of cluster assignments, one per row of df and
        in the same order as the rows. Each assignment is the position
        of the closest centroid by euclidean distance: 0, 1, 2, ..., k-1.
        On a tie the earliest centroid wins.
    '''
    # a single cdist call fills in the whole (objects x centroids)
    # distance table, rather than looping over df one row at a time
    distances = cdist(df.values, centroids.values, metric = "euclidean")

    # the closest centroid for an object is the smallest entry in its row
    closest = np.argmin(distances, axis = 1)
    return list(closest)

def plot_clusters(df, x, y, cluster = "cluster"):
    ''' Show a scatterplot of the rows in df.

        x, y - names of the columns to put on each axis
        cluster - name of the column that decides each dot's color

        Displays the figure with plt.show(); nothing is returned.
    '''
    dot_style = {"hue" : cluster, "palette" : "flare", "s" : 100}
    sns.scatterplot(data = df, x = x, y = y, **dot_style)
    plt.show()

def recompute_centroids(df, numeric = NUMERIC, cluster = "cluster"):
    ''' Re-compute each centroid as the mean of the objects
        currently assigned to its cluster.

        df - dataframe of objects, including a cluster column
        numeric - list of feature column names to average
        cluster - name of the column holding cluster assignments

        Returns a dataframe with only the numeric columns, one row
        per cluster label found in df, with rows numbered 0, 1, 2, ...
        NOTE: a label that no object currently has gets no row here,
        so the result can have fewer rows than the previous centroids.
    '''
    per_cluster = df.groupby(cluster)
    means = per_cluster[numeric].mean()

    # groupby leaves the cluster labels in the index; move them out
    # so the rows are renumbered, then keep just the feature columns
    means = means.reset_index()
    return means[numeric]

def main():
    # step one: read the json file, then flatten the records found
    # under its "results" field into a dataframe
    raw = pd.read_json(ALIEN_FILE)
    df = pd.json_normalize(raw["results"])
    print(df.columns)

    # step two: K randomly chosen rows become the first centroids.
    # Show which ones were picked, then keep only their feature values
    starters = select_centroids(df, K)
    print(starters["name"])
    centroids = starters[NUMERIC]
    print(centroids)

    # step three: alternate between assigning and re-centering.
    # A fixed number of rounds stands in for "loop until the clusters
    # stabilize", which should be enough for a small dataset
    x_col = NUMERIC[0]
    y_col = NUMERIC[1]
    rounds = 5
    for _ in range(rounds):
        # label every object with its closest centroid
        df["cluster"] = assign_to_cluster(df[NUMERIC], centroids)
        print(df.head(5))

        # look at the clustering as it stands this round
        plot_clusters(df, x_col, y_col, "cluster")

        # move each centroid to the mean of its cluster
        centroids = recompute_centroids(df)

    # the final cluster assignment for every alien
    print(df)

# run the clustering demo only when this file is executed directly,
# not when it is imported from another module
if __name__ == "__main__":
    main()