'''
DS2000
Spring 2022
Practicing with pandas for prez election data

Useful from Pandas...
    - import pandas as pd
    - Read in a CSV file to a dataframe: pd.read_csv()
    - See the first few rows: df.head()
    - Summarize the quick stats: df.describe()
    - Keep only the columns I like: df = df[["col1", "col2"]]
    - Group together data with same col1,col2 values:
      df = df.groupby(["col1", "col2"])
'''

import pandas as pd
import matplotlib.pyplot as plt

PREZ = "1976-2020-president.csv"


def main():
    """Explore the election file, then chart major-party votes by year.

    Reads PREZ into a dataframe, prints several exploratory views of it,
    totals candidatevotes per (year, party_simplified), keeps only the
    DEMOCRAT and REPUBLICAN rows, and draws one line per party showing
    its vote total for each election year.
    """
    # Step one --- read in from the file to a dataframe
    # NOTE(review): the file is parsed as tab-separated even though its
    # name ends in .csv -- confirm this matches the actual data file.
    df = pd.read_csv(PREZ, sep="\t")

    # Get a peek of what's in the file
    print(df.head(20))

    # What are the columns?
    print(df.columns)

    # Summary of numeric data
    print(df.describe())

    # What are the datatypes of my data?
    print(df.dtypes)

    # Keep only the columns I like
    df = df[["year", "state", "candidate", "candidatevotes",
             "party_simplified"]]
    print(df.head(20))
    print(df["candidate"].head(20))

    # Group together by year and party. Only candidatevotes is summed:
    # "state" and "candidate" hold text, and adding text together is
    # meaningless (newer pandas versions no longer drop such columns
    # automatically during a groupby sum).
    df = (df.groupby(["year", "party_simplified"])["candidatevotes"]
            .sum()
            .reset_index())
    print(df.head(20))
    print()

    # Sort within each year by candidatevotes
    df = df.sort_values(by=["year", "candidatevotes"])
    print(df.head(20))
    print()

    # Get rid of any row where the party is not a dem or a repub
    party_colors = {"DEMOCRAT": "blue", "REPUBLICAN": "red"}
    df = df[df["party_simplified"].isin(list(party_colors))]
    df = df.sort_values(by=["year", "party_simplified"])
    print(df.head(20))

    # Make a line chart of votes over the years. Each party is plotted
    # against its own years so that x and y always have the same length,
    # even if one party has no row for some year.
    for party, color in party_colors.items():
        party_rows = df[df["party_simplified"] == party]
        plt.plot(party_rows["year"], party_rows["candidatevotes"], "-o",
                 color=color, label=party.title())

    plt.xlabel("Election year")
    plt.ylabel("Total votes")
    plt.title("Presidential votes by party")
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()