#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Felix Muzny
10/18/2022
DS 2000

A personal library of data processing & file reading utilities.

For Prof. Rachlin's version: 
https://course.ccs.neu.edu/ds2000/john/11/dataproc.py

"""

def read_data_dict(filename, type_cast_dict = None):
    """
    Reads in the data in a given file
    and stores the values in a list of dicts of strings (by default).
    Assumes that commas separate row items in the given file and that
    the first line of the file holds the column headers.

    Parameters
    ----------
    filename : string
        name of the file
    type_cast_dict : dict, optional
        maps a column header to the function used to convert the values
        in that column, e.g. {"rotten_tomato": int, "IMDB": float}.
        Columns that are not listed are kept as strings.
        The default is None (no conversions).

    Returns
    -------
    data : list of dicts
        one dict per line after the header line, mapping each header
        to that line's value for the column

    Raises
    ------
    IndexError
        if a line has more comma-separated values than there are headers
    """
    # None (rather than a mutable {} default) so that no dict object
    # is shared between separate calls to this function
    if type_cast_dict is None:
        type_cast_dict = {}

    data = []

    # the with-block closes the file for us, even when one of the
    # conversions below raises an error part-way through the file
    with open(filename, "r") as file:
        headers = file.readline().strip().split(",")

        for line in file:
            pieces = line.strip().split(",")

            row_dict = {}
            # go through each column and link the value
            # to the appropriate header
            for i, piece in enumerate(pieces):
                header = headers[i]
                if header in type_cast_dict:
                    cast_func = type_cast_dict[header]
                    piece = cast_func(piece)
                row_dict[header] = piece

            data.append(row_dict)

    return data

def read_data(filename, skip_header = False, type_casts = None):
    """
    Reads in the data in a given file
    and stores the values in a list of lists of strings (by default).
    Assumes that commas separate row items in the given file.

    Parameters
    ----------
    filename : string
        name of the file
    skip_header: boolean, optional
        whether or not to skip a header row. Default to False.
    type_casts: list, optional
        type specification for each column in the data, in column
        order, e.g. [str, str, int, float]. The default is None
        (every value is kept as a string).

    Returns
    -------
    data : list of lists
        list of lists of values for all lines in the file

    Raises
    ------
    IndexError
        if type_casts is given and a line has more comma-separated
        values than type_casts has entries
    """
    data = []

    # the with-block closes the file for us, even when one of the
    # conversions below raises an error part-way through the file
    with open(filename, "r") as file:
        # do we need to skip the first row?
        if skip_header:
            file.readline()

        for line in file:
            # ['Black Adam', 'PG-13', '39', '7.1']
            pieces = line.strip().split(",")

            if type_casts is not None:
                # go through each column and do the proper conversion
                for i, piece in enumerate(pieces):
                    cast_func = type_casts[i]
                    pieces[i] = cast_func(piece)

            data.append(pieces)

    return data


def convert_mutate(ls, cast = int):
    """
    Applies the given function to every value of the given list,
    storing each result back in the same position.
    The given list itself is changed; no new list is made.

    Parameters
    ----------
    ls : list
        values to convert in place.
    cast : function, optional
        Called once on each element; its result replaces that element.
        The default is int.

    Returns
    -------
    None.
    """
    for position, value in enumerate(ls):
        # overwrite the slot we just read with its converted value
        ls[position] = cast(value)
        
def convert(ls, cast = int):
    """
    Builds a new list holding the result of applying the given
    function to every value of the given list, in the same order.
    The given list is left unchanged.

    Parameters
    ----------
    ls : list
        values to convert.
    cast : function, optional
        Called once on each element. The default is int.

    Returns
    -------
    a new list of converted values.
    """
    return [cast(value) for value in ls]
        
        
def get_column(data, column_index):
    """
    Pulls a single column out of a 2d list (a list of rows).
    Assumes every row is long enough to contain the requested
    column index.

    Parameters
    ----------
    data : list of lists
        full dataset, one sub-list per row
    column_index : int
        0-indexed position of the column to extract

    Returns
    -------
    column : list
        the value found at column_index in each row, in row order
    """
    return [record[column_index] for record in data]


def fill_values(ls, default_val = "0.0"):
    """
    Replaces every empty entry of the given list with a default value.
    An entry counts as empty when it is "" once surrounding
    whitespace has been stripped. The given list itself is changed.

    Parameters
    ----------
    ls : list
        List of strings, some of which may be empty/blank.
    default_val : any type, optional
        Value written in place of each empty entry.
        The default is "0.0".

    Returns
    -------
    None.

    """
    for position, entry in enumerate(ls):
        # entries with real content are left exactly as they are
        if entry.strip() != "":
            continue
        ls[position] = default_val
    

def main():
    """
    Demonstrates the library on the file boston_earnings.csv:
    reads the file, prints the first five values of column 0,
    then prints the first five values of column 1 three times -
    as read, after filling blanks, and after converting to floats.
    """
    print("THIS IS THE MAIN FUNCTION")
    print("Some examples of data processing usage")
    rows = read_data("boston_earnings.csv")

    # column 0 (presumably the job titles - confirm against the file)
    title_col = get_column(rows, 0)
    print(title_col[:5])
    print()

    # column 1 (presumably the salaries - confirm against the file)
    salary_col = get_column(rows, 1)
    print(salary_col[:5])

    # blanks must be filled before converting, since float("") fails
    fill_values(salary_col)
    print(salary_col[:5])

    salary_col = convert(salary_col, cast = float)
    print(salary_col[:5])
    print("*************")
   
# UPDATED FROM LECTURES 10/18
# Guard the call to main() so that it does not run when this file
# is imported.
#
# We only want to run main() when this file is itself being run,
# and not when it is imported as a library/module by another file.
if __name__ == "__main__":
    main()