#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Felix Muzny
10/18/2022
DS 2000

A personal library of data processing & file reading utilities.

For Prof. Rachlin's version:
https://course.ccs.neu.edu/ds2000/john/11/dataproc.py
"""


def read_data_dict(filename, type_cast_dict=None):
    """
    Reads in the data in a given file and stores the values in a
    list of dicts of strings (by default). The first line of the file
    is used as the header row that supplies the dict keys.
    Assumes that commas separate row items in the given file.

    Parameters
    ----------
    filename : string
        name of the file
    type_cast_dict : dict, optional
        maps a header name to the function used to convert the values
        in that column, e.g. {"rotten_tomato": int, "IMDB": float}.
        Columns that are not listed are left as strings.

    Returns
    -------
    data : list of dicts
        list of dicts of values for all lines in the file

    Raises
    ------
    IndexError
        if a row has more items than the header row.
    """
    # A fresh dict per call instead of a shared mutable default argument
    if type_cast_dict is None:
        type_cast_dict = {}

    data = []
    # The with-block closes the file even if a cast function raises
    with open(filename, "r") as file:
        headers = file.readline().strip().split(",")

        for line in file:
            pieces = line.strip().split(",")
            row_dict = {}

            # go through each column and link the value
            # to the appropriate header
            for i, piece in enumerate(pieces):
                header = headers[i]
                if header in type_cast_dict:
                    row_dict[header] = type_cast_dict[header](piece)
                else:
                    row_dict[header] = piece

            data.append(row_dict)

    return data


def read_data(filename, skip_header=False, type_casts=None):
    """
    Reads in the data in a given file and stores the values in a
    list of lists of strings (by default).
    Assumes that commas separate row items in the given file.

    Parameters
    ----------
    filename : string
        name of the file
    skip_header : boolean, optional
        whether or not to skip a header row. Default to False.
    type_casts : list, optional
        type specification for each column in the data, given as one
        conversion function per column, e.g. [str, str, int, float]

    Returns
    -------
    data : list of lists
        list of lists of values for all lines in the file

    Raises
    ------
    IndexError
        if a row has more items than type_casts has functions.
    """
    data = []
    # The with-block closes the file even if a cast function raises
    with open(filename, "r") as file:
        # do we need to skip the first row?
        if skip_header:
            file.readline()

        for line in file:
            # e.g. ['Black Adam', 'PG-13', '39', '7.1']
            pieces = line.strip().split(",")

            if type_casts is not None:
                # go through each column and do the proper conversion;
                # casts are looked up by index so that a too-short
                # type_casts list is reported rather than ignored
                pieces = [type_casts[i](piece)
                          for i, piece in enumerate(pieces)]

            data.append(pieces)

    return data


def convert_mutate(ls, cast=int):
    """
    Converts all values in given list according to the given function.
    Changes to ints by default. Mutates the given list.

    Parameters
    ----------
    ls : list
        list of values to be converted.
    cast : function, optional
        Function to be applied to every element in the list.
        The default is int.

    Returns
    -------
    None.
    """
    for i, value in enumerate(ls):
        ls[i] = cast(value)


def convert(ls, cast=int):
    """
    Converts all values in given list according to the given function.
    Does not mutate the given list.

    Parameters
    ----------
    ls : list
        list of values to be converted.
    cast : function, optional
        Function to be applied to every element in the list.
        The default is int.

    Returns
    -------
    a new list of converted values.
    """
    return [cast(value) for value in ls]


def get_column(data, column_index):
    """
    Reads in the data from one column in a given list of lists.
    Assumes that all sub-lists have the same length and that
    the given column index is a valid column index.

    Parameters
    ----------
    data : list of lists
        full dataset as a 2d list
    column_index : int
        0-indexed number indicating which column to read

    Returns
    -------
    column : list
        list of values for all rows for the target column
    """
    return [row[column_index] for row in data]


def fill_values(ls, default_val="0.0"):
    """
    Fill in empty/missing values in list with a default value.
    A value counts as missing when it is empty or only whitespace.
    Mutates the passed-in list.

    Parameters
    ----------
    ls : list
        List of strings with empty/missing values.
    default_val : any type, optional
        Value to replace "" with. The default is "0.0".

    Returns
    -------
    None.
    """
    for i, value in enumerate(ls):
        if value.strip() == "":
            ls[i] = default_val


def main():
    """
    Prints some examples of using this library on the file
    boston_earnings.csv.
    """
    print("THIS IS THE MAIN FUNCTION")
    print("Some examples of data processing usage")
    data = read_data("boston_earnings.csv")

    titles = get_column(data, 0)
    print(titles[:5])
    print()

    salaries = get_column(data, 1)
    print(salaries[:5])
    fill_values(salaries)
    print(salaries[:5])
    salaries = convert(salaries, cast=float)
    print(salaries[:5])
    print("*************")


# UPDATED FROM LECTURES 10/18
# Add a guard so that main() doesn't run if we import this file
#
# only want to run main if we are running this file
# and not if we are importing it as a library/module
if __name__ == "__main__":
    main()