""" Felix Muzny November 17/18, 2021 DS 2001 - CS Practicum #9 k-means clustering for 2-dimensional points! """ import kmeans_utils as utils import matplotlib.pyplot as plt import math # constant for Task 4 ITERATIONS = 3 def distance(p1, p2): """ name: distance This function takes two 2-d points and calculates the Euclidian distance between them. parameters: point1 - list of two numbers point2 - list of two numbers return: float euclidian distance between the two points (x1, y1) and (x2, y2) """ d = math.sqrt((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) return d def closest(target_point, point_list): """ name: closest This function takes a target point and a list of points and calculates the Euclidian distance between them to find the point in the list that is closest to the target. parameters: target_point - list of a 2d point point_list - list of lists of 2d points return: the tuple point from point_list that is closest to target_point """ closest = None current_min = -1 for point in point_list: d = distance(target_point, point) if d < current_min or closest is None: closest = point current_min = d return closest def graph_clusters(clusters, title, iteration, new_centroids = None): """ name: graph_clusters This function displays a scatter plot of clusters with their assigned centroids. Optionally also displays new centroids. parameters: clusters - dict of centroids (tuples) mapping to the data points assigned to them title - str base of the title to display iteration - int iteration number to display new_centroids - list of new centroids (tuples) to display. Default value None (undisplayed) return: None """ for cluster in clusters: plt.scatter(cluster[0], cluster[1], label=str(cluster), marker="s") # plot the series of points xs = utils.get_column(clusters[cluster], 0) ys = utils.get_column(clusters[cluster], 1) plt.scatter(xs, ys, label=str(cluster)) if new_centroids is not None: xs = utils.get_column(centroids, 0) ys = utils.get_column(centroids, 1) plt.scatter(xs, ys, label="new centroids", marker="s") plt.xlabel("x value") plt.ylabel("y value") plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) title = title + ", iteration: " + str(iteration) plt.title(title) plt.savefig(title.replace(" ", "_").replace(",", "").replace(":", "") + ".pdf", bbox_inches="tight") plt.show() if __name__ == "__main__": """ From Part 1 """ # Task 1 animal_ratings = {"cat": 4, "dog":4, "whale": 100} print(animal_ratings) # Task 2 points = utils.generate_points(20) centroids = utils.generate_points(2) print("data:", points) print("centroids:", centroids) print() # graphing our data plt.scatter(utils.get_column(points, 0), utils.get_column(points, 1), label="data") plt.scatter(utils.get_column(centroids, 0), utils.get_column(centroids, 1), label="centroids") plt.xlabel("x value") plt.ylabel("y value") plt.title("initial data for k-means clustering") plt.legend() plt.savefig("initialdata.pdf", bbox_inches="tight") plt.show() # Testing task 3 print("Testing distance function") print("distance from (0, 0) to (1, 1):", distance((0, 0), (1, 1))) print("distance from (-10, 0) to (10, 0):", distance((-10, 0), (10, 0))) print() target = [1, 1] # Testing task 4 print("Testing closest function") print("closest centroid to (0, 0):", closest((0, 0), centroids)) print("closest centroid to (10, 10):", closest((10, 10), centroids)) print() # Task 5 clusters = {} for point in points: belongs_to = closest(point, centroids) if belongs_to not in clusters: clusters[belongs_to] = 0 clusters[belongs_to] += 1 print("counts of points assigned to clusters") print(clusters) """ Additions for part 2 """ # Task 4 # Task 5 makes this a while loop while change > 0 # rather than a fixed number of iterations change = 100 i = 1 while change > 0: # help visually distinguish between iterations print("****************") print("Iteration:", i) # Task 1 # assign points to clusters clusters = {} for point in points: belongs_to = closest(point, centroids) if belongs_to not in clusters: clusters[belongs_to] = [] clusters[belongs_to].append(point) print("points assigned to clusters:") print(clusters) # Task 2 graph_clusters(clusters, "assign to clusters step", i) # Task 3 # choose better centroids # Task 5 # adding how much did the centroids change? change = 0 centroids = [] for cluster in clusters: # calculate the mean of the cluster xs = utils.get_column(clusters[cluster], 0) ys = utils.get_column(clusters[cluster], 1) centroid = (sum(xs) / len(xs), sum(ys) / len(ys)) centroids.append(centroid) # Task 5 change_x = abs(cluster[0] - centroid[0]) change_y = abs(cluster[1] - centroid[1]) change += change_x + change_y print("updated centroids:", centroids) graph_clusters(clusters, "reassign centroids step", i, centroids) print("change in centroids:", change) print() i += 1 print("done!")