""" Felix Muzny DS 2000 Lecture 21 November 22, 2024 - doing some nearest-neighbors machine learning on movies - distance metrics """ import csv import matplotlib.pyplot as plt FILENAME = "movies_train.csv" K = 3 FEATURES = ["funny", "scary"] LABEL_NAME = "label" FIRST_FEATURE = "funny" SECOND_FEATURE = "scary" def read_csv(filename, float_features = FEATURES, label_name = LABEL_NAME): """ Read the csv in, converting the columns in float_features to floats and label_name to an int; leaving all other columns as strings :param filename: str :param float_features: list of str column names :param label_name: str name of column with integer label :return: list of dicts """ data = [] with open(filename, "r") as file: reader = csv.DictReader(file) for line in reader: # clean the data for float_feat in float_features: line[float_feat] = float(line[float_feat]) # make label into an int line[label_name] = int(line[label_name]) data.append(line) return data def get_movie_user(): """ Get a new data point to label! Asks the user to input movie name, imdb rating, funny, and scary :return: dictionary """ title = input("What movie title? ") funny = float(input("How funny? (1 - 10) ")) scary = float(input("How scary? (1 - 10) ")) new_movie = {"title": title, "funny": funny, "scary": scary} return new_movie def visualize(target_movie, train_data, first_feat=FIRST_FEATURE, second_feat=SECOND_FEATURE, label_name=LABEL_NAME): """ Make a scatter plot visualizing the training data and the new (unlabeled) data point :param target_movie: dictionary representing unlabeled new movie :param train_data: list of dicts :param first_feat: str :param second_feat: str :param label_name: str :return: none """ xs = [] ys = [] colors = [] for datapoint in train_data: xs.append(datapoint[first_feat]) ys.append(datapoint[second_feat]) # color the data points by their label if datapoint[label_name] == 1: colors.append("red") else: colors.append("blue") # challenge problem: also label each data point with the title of # the associated movie! plt.scatter(xs, ys, c=colors) plt.plot(target_movie[first_feat], target_movie[second_feat], "*", color="black") plt.xlabel(first_feat) plt.ylabel(second_feat) plt.show() def distance(movie1, movie2, first_feat = FIRST_FEATURE, second_feat = SECOND_FEATURE): """ Calculate the euclidian distance between two movies based off of two features. :param movie1: dictionary with given features :param movie2: dictionary with given features :param first_feat: string feature name :param second_feat: string feature name :return: float distance between movies """ total_distance = (movie1[first_feat] - movie2[first_feat]) ** 2 total_distance += (movie1[second_feat] - movie2[second_feat]) ** 2 # finally, take the square root return total_distance ** 0.5 def nearest_neighbors(target_movie, train_movies, k): """ Find the k nearest neighbors to the target movie :param target_movie: dictionary :param train_movies: list of dictionaries :param k: int number of nearest neighbors :return: list of string titles of nearest movies """ # make a list of lists with each element being # [distance, title] # so we can easily sort later dist_list = [] # find the distance from this movie to all others for movie in train_movies: # distance between target and movie dist = distance(target_movie, movie) dist_list.append([dist, movie["title"]]) # find the k movies with min distance # sort sorts by the first element in the sub-lists by default dist_list.sort() k_closest = dist_list[:k] # return these titles # but do this with ~list comprehension~ # # equivalent to: # k_closest_titles = [] # for item in k_closest_titles: # k_closest_titles.append(item[1]) k_closest_titles = [item[1] for item in k_closest] return k_closest_titles def predict(movie_titles, train_movies, label_name = LABEL_NAME): """ Predict a label based on a majority vote from the identified movie titles :param movie_titles: list of strings :param train_movies: list of dictionary movie data points :param label_name: string name of field where labels can be found :return: int """ label1_count = 0 label0_count = 0 for movie in train_movies: # is this one of the nearest movies? if movie["title"] in movie_titles: if movie[label_name] == 1: label1_count += 1 else: label0_count += 1 if label1_count > label0_count: return 1 else: return 0 def main(): # read in the training data train = read_csv(FILENAME) print("First train example:", train[0]) # get a movie from the user user_movie = get_movie_user() # take a look at what everything looks like with the visualize function visualize(user_movie, train) # tell the user if you think they will like it! nearest = nearest_neighbors(user_movie, train, K) print("Closest movies from the training data:", nearest) print() # now take a vote resulting_label = predict(nearest, train) print("Do we think you'll like it?") print("Predicted label (0 means no, 1 means yes):", resulting_label) main()