'''
    DS2500
    Spring 2025
    Sample code from class 2/4/25

    Our goal today:
        - determine whether admission rate & tuition are correlated
        - if so, then apply a linear regression...
            - given a new admission rate, predict what tuition will be
            - draw the line of best fit

    Libraries for today, might need to install:
        - statistics (correlation)
        - scipy (compute the line of best fit, slope/intercept)
        - seaborn (draws the line of best fit)
'''

import statistics
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# read_csv, col_to_lst and clean_currency are not defined in this file,
# so they are expected to come from this wildcard import
from utils import *

# input files, plus the index of the column holding the values we analyze
ADM_FILE = "data/admission_rate.csv"
TUITION_FILE = "data/tuition.csv"
DATA_COL = 1


def main():
    '''
        Check whether admission rate and tuition are correlated, fit a
        linear regression to them, print the tuition predicted for one
        sample admission rate, and show the data with its line of best fit.

        Parameters: none
        Returns: nothing; prints two lines and opens a plot window
    '''
    # step 1: load both files first, then pull the data column out of each
    adm_rows = read_csv(ADM_FILE)
    tuition_rows = read_csv(TUITION_FILE)
    raw_adm = col_to_lst(adm_rows, DATA_COL)
    raw_tuition = col_to_lst(tuition_rows, DATA_COL)

    # step 2: make the raw column values numeric
    # (clean_currency presumably strips currency formatting and returns
    #  a number -- confirm against utils)
    adm_lst = list(map(float, raw_adm))
    tuition_lst = list(map(clean_currency, raw_tuition))

    # step 3: how strongly do the two variables move together?
    corr = statistics.correlation(adm_lst, tuition_lst)
    print(f"Correlation between adm and tuition: {corr}")

    # step 4: in class the correlation came out strongly negative, which
    # is what makes a linear model a reasonable next step.
    # x = admission rate, y = tuition
    fit = stats.linregress(adm_lst, tuition_lst)

    # sample admission rate to predict for; it must be in the same units
    # as the values in the admissions file. Prediction is y = m * x + b
    adm_x = 5
    predicted_tuition = adm_x * fit.slope + fit.intercept
    print(f"With admission rate {adm_x}, we predict tuition to be ${predicted_tuition}")

    # step 5: scatter plot of the data with the line of best fit on top
    sns.regplot(x=adm_lst, y=tuition_lst, color="pink")
    plt.show()


if __name__ == "__main__":
    main()