"""
Felix Muzny
DS 2000
Lecture 21
November 22, 2024

- doing some nearest-neighbors machine learning on movies
- distance metrics
"""

import csv
import matplotlib.pyplot as plt

# csv file of labeled training movies that main() loads
FILENAME = "movies_train.csv"
# number of nearest neighbors that main() asks for
K = 3
# columns that read_csv converts to floats by default
FEATURES = ["funny", "scary"]
# default name of the column holding each movie's integer label
LABEL_NAME = "label"
# default pair of features used as the x/y axes in visualize and as the
# two coordinates in distance
FIRST_FEATURE = "funny"
SECOND_FEATURE = "scary"

def read_csv(filename, float_features = FEATURES, label_name = LABEL_NAME):
    """
    Read a csv file into a list of dictionaries (one per row), converting the
    columns in float_features to floats and the label_name column to an int;
    all other columns are left as strings
    :param filename: str path of the csv file to read
    :param float_features: list of str column names to convert to floats
    :param label_name: str name of column with integer label
    :return: list of dicts, one per row, keyed by the csv header names
    :raises KeyError: if a requested column is not in the csv header
    :raises ValueError: if a value cannot be converted to a float/int
    """
    data = []
    # newline="" hands newline handling over to the csv module, which is what
    # the csv documentation asks for: without it, quoted fields that contain
    # line breaks can be split incorrectly
    with open(filename, "r", newline="") as file:
        reader = csv.DictReader(file)
        for line in reader:
            # clean the data: the numeric feature columns become floats
            for float_feat in float_features:
                line[float_feat] = float(line[float_feat])
            # make label into an int
            line[label_name] = int(line[label_name])

            data.append(line)

    return data

def get_movie_user():
    """
    Prompt the user for a new (unlabeled) movie to classify.
    Asks for the movie title, then how funny and how scary it is; both
    ratings are converted to floats
    :return: dictionary with the keys "title", "funny", and "scary"
    """
    movie_title = input("What movie title? ")
    # the ratings are asked for in the same order as the dictionary keys
    ratings = {
        "funny": float(input("How funny? (1 - 10) ")),
        "scary": float(input("How scary? (1 - 10) ")),
    }
    return {"title": movie_title, **ratings}

def visualize(target_movie, train_data, first_feat=FIRST_FEATURE, second_feat=SECOND_FEATURE, label_name=LABEL_NAME):
    """
    Show a scatter plot of the training movies (colored by label) together
    with the new, unlabeled movie drawn as a black star
    :param target_movie: dictionary representing unlabeled new movie
    :param train_data: list of dicts
    :param first_feat: str name of the feature plotted on the x axis
    :param second_feat: str name of the feature plotted on the y axis
    :param label_name: str name of the field holding each movie's label
    :return: none
    """
    x_values = [point[first_feat] for point in train_data]
    y_values = [point[second_feat] for point in train_data]
    # movies labeled 1 are red; any other label is blue
    point_colors = ["red" if point[label_name] == 1 else "blue"
                    for point in train_data]

    # challenge problem: also label each data point with the title of
    # the associated movie!

    plt.scatter(x_values, y_values, c=point_colors)
    # the movie we want to classify stands out as a black star
    plt.plot(target_movie[first_feat], target_movie[second_feat], "*", color="black")
    plt.xlabel(first_feat)
    plt.ylabel(second_feat)
    plt.show()

def distance(movie1, movie2, first_feat = FIRST_FEATURE, second_feat = SECOND_FEATURE):
    """
    Compute the Euclidean (straight-line) distance between two movies, using
    two of their features as the coordinates.
    :param movie1: dictionary with given features
    :param movie2: dictionary with given features
    :param first_feat: string feature name
    :param second_feat: string feature name
    :return: float distance between movies
    """
    # how far apart the movies are along each feature
    first_gap = movie1[first_feat] - movie2[first_feat]
    second_gap = movie1[second_feat] - movie2[second_feat]
    sum_of_squares = first_gap ** 2 + second_gap ** 2
    # square root of the summed squares gives the distance
    return sum_of_squares ** 0.5

def nearest_neighbors(target_movie, train_movies, k):
    """
    Find the titles of the k training movies closest to the target movie
    :param target_movie: dictionary
    :param train_movies: list of dictionaries
    :param k: int number of nearest neighbors
    :return: list of string titles of nearest movies, closest first
    """
    # pair every training movie's distance with its title; the pairs are
    # compared element by element, so sorting orders them by distance
    # first (equal distances fall back to comparing titles)
    ranked = sorted(
        (distance(target_movie, candidate), candidate["title"])
        for candidate in train_movies
    )

    # keep just the titles of the k closest movies
    return [title for _, title in ranked[:k]]

def predict(movie_titles, train_movies, label_name = LABEL_NAME):
    """
    Predict a label by majority vote among the training movies whose titles
    appear in movie_titles. Label 1 wins only with strictly more votes than
    all other labels combined; a tie (or no votes at all) predicts 0
    :param movie_titles: list of strings
    :param train_movies: list of dictionary movie data points
    :param label_name: string name of field where labels can be found
    :return: int (1 or 0)
    """
    # collect the labels of the training movies that are among the nearest
    votes = [
        movie[label_name]
        for movie in train_movies
        if movie["title"] in movie_titles
    ]
    votes_for_1 = sum(1 for vote in votes if vote == 1)
    # every vote that isn't a 1 counts toward label 0
    votes_for_0 = len(votes) - votes_for_1
    return 1 if votes_for_1 > votes_for_0 else 0


def main():
    """
    Run the whole demo: load the training movies, ask the user for a new
    movie, plot everything, then predict a label from the K nearest movies
    :return: none
    """
    # load the labeled training movies
    training_movies = read_csv(FILENAME)
    print("First train example:", training_movies[0])

    # ask the user for the movie we should classify
    new_movie = get_movie_user()

    # plot the training data together with the new movie
    visualize(new_movie, training_movies)

    # which training movies is the new movie most similar to?
    closest_titles = nearest_neighbors(new_movie, training_movies, K)
    # end="\n\n" leaves a blank line after the list of titles
    print("Closest movies from the training data:", closest_titles, end="\n\n")

    # let those movies vote on the label
    prediction = predict(closest_titles, training_movies)
    print("Do we think you'll like it?")
    print("Predicted label (0 means no, 1 means yes):", prediction)

# only run the program when this file is executed directly, so importing it
# (for example to reuse distance or predict) doesn't start prompting for input
if __name__ == "__main__":
    main()