'''
    DS2500
    Spring 2025
    Sample code from lecture - normalizing adm rate and tuition

    Goals:
        - draw lineplot of time vs tuition & time vs admission rate
        - normalize both datasets
        - draw lineplots again, and compute euclidean distance
        - compute std dev of each dataset

    NOTE: as written, this script draws both pairs of lineplots and prints
    the standard deviation of the tuition data only; the euclidean distance
    and the admission-rate std dev from the goals list are not computed here.
'''

import statistics

from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Presumably supplies read_csv, col_to_lst, clean_currency and normalize,
# which are used below but not defined in this file -- verify against utils.
from utils import *

ADM_FILE = "data/admission_rate.csv"
TUITION_FILE = "data/tuition.csv"

# Column positions passed to col_to_lst for each file
DATA_COL = 1
YEAR_COL = 0


def plot_comparison(x_vals, y_vals, title, xlabel):
    ''' Draw several series against a shared x axis on a single figure.

        Parameters:
            x_vals - list of x values shared by every series
            y_vals - dict mapping a legend label to its list of y values
                     (one y value per entry in x_vals)
            title  - title of the figure
            xlabel - label for the x axis

        Returns nothing; the finished plot is displayed with plt.show().
    '''
    # One line per labelled series, all on the current axes
    for label, yval in y_vals.items():
        sns.lineplot(x=x_vals, y=yval, label=label)

    plt.xlabel(xlabel)
    plt.title(title)
    plt.legend()
    plt.show()


def main():
    ''' Read the admission and tuition files, plot the raw and normalized
        values against year, and print the std deviation of tuition.
    '''
    # gather data - read in admissions and tuition data,
    # pull out important columns
    adm_data = read_csv(ADM_FILE)
    tuition_data = read_csv(TUITION_FILE)
    adm_lst = col_to_lst(adm_data, DATA_COL)
    tuition_lst = col_to_lst(tuition_data, DATA_COL)
    # The years are taken from the tuition file and reused as the x axis
    # for both series
    year_lst = col_to_lst(tuition_data, YEAR_COL)

    # convert our lists of strings into numbers
    adm_lst = [float(adm) for adm in adm_lst]
    # clean_currency comes from utils; presumably it strips currency
    # formatting and returns a number -- TODO confirm
    tuition_lst = [clean_currency(tuition) for tuition in tuition_lst]
    year_lst = [int(year) for year in year_lst]

    # Plot year vs tuition, year vs admission rate on the raw values.
    # It looks terrible and misleading! Like adm rates have stayed flat
    # this whole time...
    plot_comparison(year_lst,
                    {"Admission": adm_lst, "Tuition": tuition_lst},
                    "Admission Rate, Tuition at Northeastern 2013-2021",
                    xlabel="Year")

    # normalize our tuition and adm rate data, and plot again
    norm_tuition = normalize(tuition_lst)
    norm_adm = normalize(adm_lst)
    plot_comparison(year_lst,
                    {"Admission": norm_adm, "Tuition": norm_tuition},
                    "Admission Rate, Tuition at Northeastern 2013-2021",
                    xlabel="Year")

    # std dev of tuition? statistics.stdev is the sample standard deviation,
    # i.e. the square root of statistics.variance
    std_dev = statistics.stdev(tuition_lst)
    print(f"The std deviation of tuition 2013-2021 is {round(std_dev, 3)}")


if __name__ == "__main__":
    main()