#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Felix Muzny
11/1/2022
DS 2000
Lecture 16 - string functions, data, designing programs

Logistics:
- Homework 7 is due Friday @ 9pm
    - Dictionaries are required.
- Quiz 7 is available.
- remote attendance (https://bit.ly/remote-ds2000-muzny)

Three ways to participate (please do one of these!)
1) via the PollEverywhere website: https://pollev.com/muzny
2) via text: text "muzny" to the number 22333 to join the session
3) via Poll Everywhere app (available for iOS or Android)

Warm-up 0: Lecture 15
---
Have you watched the asynchronous lecture from last Friday?

A. yes

B. no

C. no, but I'll do that later today

Have you read the HW 7 assignment?

A. yes

B. no

C. no, but I'll do that later today
"""

"""
Strings!
---
What do we know so far?
- characters that you can't do math with (even if they are "72")
- grouped together using quotation marks (have an order)
- when we read in data, we read it as strings
- we find them everywhere (e.g. in our csv files)
- certain data can only be represented as strings
- for functions:
    - pass them to print()
    - input() both takes (as a parameter) and returns a string
    - we can use string functions like .upper() to get different
      versions of strings
    - casting functions like int() or float()
"""

animal = "turtle"
print(animal)
print(animal.upper())

# ask how long a string is
print(len(animal))

# get the first character in a string
print(animal[0])
print()

"""
Iterating through strings
---
Strings are 0-indexed
 012345
"turtle"
"""

# with position/index:
for i in range(len(animal)):
    print(i, ":", animal[i])
print()

# we can also iterate by value (by character)
for letter in animal:
    print(letter)
print()

# OOOPS I FORGOT TO CALL SPLIT
fruits = "mango banana apple kiwi"
# OH NO, iterating by letter, not by word!
# (left as-is on purpose: this loop demonstrates the mistake)
for fruit in fruits:
    # fruit = "m"
    # fruit[0] -> m
    print(fruit)
    if fruit[0] == "a":
        print("A FRUIT")
print()

"""
Using string functions
---
Python provides so many built in string functions
For all of them, we need to remember that strings
are **IMMUTABLE**.

This is different than lists.
"""

ls = [1, 5, 8, 97]
print(ls)
# once I've created a list, I can change its contents
# this means that it's mutable
ls[0] = -999
print(ls)
ls.append(-999)
print(ls)
print()

# strings are immutable
animal = "cats are the most wonderful ever"
print(animal)
# TypeError: 'str' object does not support item assignment
# animal[0] = "b"
print(animal)

# creating a NEW STRING
animal2 = "b" + animal[1:]
print(animal)
print(animal2)

# whenever you use a string function, it will return
# a new string
upper_animal = animal.upper()
print(animal)
print(upper_animal)
print(upper_animal.isupper())

"""
Writing a string function
---
Write a function, count, that takes in one string and one letter
and counts how many times that letter occurs in the given string.
"""

"""
Data: what questions are we asking?
---
Take a look at the dataset:
https://provost.northeastern.edu/uds/facts/common-data-set/

What questions do we have?
- is the data accurate? (where does it come from?)
- of the total accepted students how many enrolled?
- is there a csv available?
- does the dataset contain the information that I'm looking for?

For any data set that someone gives you, what questions would
you want the answers to before you start any analysis?
- where does the data come from?
- who does the data represent?
- who/what is missing from the data?
    - (where are transgender and non-binary people represented
      in this data?)
- what questions could we answer using this data?
    - (we compare admissions statistics for racial and ethnic
      categories)
"""

"""
Designing programs & Functions (warm-ups from Sec 2)
---
"""

"""
Given the following two functions, write one new function that has
the same behavior and can be used for both cases.
"""

# *~*~ program 1 ~*~*
# Remember that whenever we import data utils,
# data_utils.py
# NOTE(review): the functions below call data_utils.get_column, not the
# local get_column defined here; the local copy looks like a reference
# reproduction of the library function -- confirm.
import data_utils


def get_column(data, column_index):
    """
    Reads in the data from one column in a given list of lists.
    Assumes that all sub-lists have the same length and that the given
    column index is a valid column index.

    Parameters
    ----------
    data : list of lists
        full dataset as a 2d list
    column_index : int
        0-indexed number indicating which column to read

    Returns
    -------
    column : list
        list of values for all rows for the target column
    """
    column = []
    for row in data:
        column.append(row[column_index])
    return column


# This is where we started (roughly)
# def max_rotten(data):
#     col_vals = data_utils.get_column(data, 2)
#     return max(col_vals)

# def max_imdb(data):
#     col_vals = data_utils.get_column(data, 3)
#     return max(col_vals)

# This is the function that we ended up with!
def max_column(data, column_index):
    """
    Finds the largest value in one column of the dataset.

    Parameters
    ----------
    data : list of lists
        full dataset as a 2d list
    column_index : int
        0-indexed number indicating which column to examine

    Returns
    -------
    maximum value found in the target column

    Raises
    ------
    ValueError
        if the column is empty (max() of an empty list)
    """
    col_vals = data_utils.get_column(data, column_index)
    return max(col_vals)


def main():
    """
    Example program 1: reads movies.csv and prints the maximum of
    column 2 (rotten tomatoes) and column 3 (imdb).
    """
    print("example program 1")
    data = data_utils.read_data(
        "movies.csv",
        skip_header=True,
        type_casts=[str, str, int, float],
    )
    print(data)
    print()

    # the first function call to your new function
    # would be here
    highest_rotten = max_column(data, 2)
    print("maximum rotten tomatoes:", highest_rotten)

    # the second function call to your new function
    # would be here
    highest_imdb = max_column(data, 3)
    print("maximum imdb:", highest_imdb)
    print()


# each example program runs its own main() right after defining it,
# because later programs in this file re-define main()
if __name__ == "__main__":
    main()

# program design is better because
# - more flexible!
# - don't have to write a new function each time we need a max
# - less code

"""
Given the following function, write two new functions that are more
flexible in combination than this singular function is by itself.
"""

# *~*~ program 2 ~*~*
import data_utils


def get_rated_movies(data, rating, rating_index=1):
    """
    Isolates the subset of rows whose rating matches the given rating.

    Parameters
    ----------
    data : list of lists
        full dataset as a 2d list
    rating : str
        rating to keep (e.g. "R" or "PG-13")
    rating_index : int, optional
        0-indexed column that holds the rating. Defaults to 1, the
        column this function originally had hard-coded.

    Returns
    -------
    movies : list of lists
        the rows of data whose rating column equals rating
    """
    # isolating the subset of data
    # that I care about
    movies = []
    for movie in data:
        # the 1 was a *GREAT* candidate to become a parameter :)
        # -> it is now rating_index (still 1 by default)
        if movie[rating_index] == rating:
            movies.append(movie)
    return movies


def max_column(data, column_index):
    """
    Finds the largest value in one column of the dataset.

    Parameters
    ----------
    data : list of lists
        full dataset as a 2d list
    column_index : int
        0-indexed number indicating which column to examine

    Returns
    -------
    maximum value found in the target column

    Raises
    ------
    ValueError
        if the column is empty (e.g. no movie matched a rating filter)
    """
    col_vals = data_utils.get_column(data, column_index)
    return max(col_vals)


def main():
    """
    Example program 2: reads movies.csv and prints the maximum of
    column 2 (rotten tomatoes) for R-rated and for PG-13-rated films.
    """
    print("example program 2")
    data = data_utils.read_data(
        "movies.csv",
        skip_header=True,
        type_casts=[str, str, int, float],
    )

    r_movies = get_rated_movies(data, "R")
    highest_rotten_R = max_column(r_movies, 2)
    print("maximum rotten tomatoes for R-rated films:", highest_rotten_R)
    print()

    pg13_movies = get_rated_movies(data, "PG-13")
    highest_rotten_pg13 = max_column(pg13_movies, 2)
    print("maximum rotten tomatoes for PG-13 -rated films:",
          highest_rotten_pg13)
    print()


if __name__ == "__main__":
    main()

# program design is better because
# - these two tasks aren't inherently linked
# - more flexible and powerful program later
#   - that we can expand w/o writing new functions

"""
Given the following function, write two new functions that are better
designed according to the principle that each function should do one
of the following three things:
- read in data
- do a calculation or a data manipulation
- display a result

# Update the corresponding main() function as needed!
"""

# *~*~ program 3 ~*~*
import data_utils


def maximums(data):
    """
    Computes the maximum value of every column in the dataset.

    Parameters
    ----------
    data : list of lists
        full dataset as a 2d list; the number of columns is taken
        from the first row

    Returns
    -------
    col_maxes : list
        one maximum per column, in column order; an empty list when
        data has no rows
    """
    col_maxes = []
    # an empty dataset has no first row to measure, so there are
    # no columns (and no maximums) to report
    if not data:
        return col_maxes

    # do a calculation
    # find the number of columns
    column_nums = len(data[0])
    for col_num in range(column_nums):
        column = data_utils.get_column(data, col_num)
        maximum_val = max(column)
        col_maxes.append(maximum_val)
    return col_maxes


def report_maxes(max_vals):
    """
    Displays each maximum value on its own line.

    Parameters
    ----------
    max_vals : list
        maximum values to print, one per column
    """
    for maximum_val in max_vals:
        # now I do the display
        print("The maximum value of this column is:", maximum_val)


def graph_maxes(max_vals):
    """
    Placeholder for a graphical display of the maximums; currently
    only prints a message and does not use max_vals.

    Parameters
    ----------
    max_vals : list
        maximum values that a real implementation would plot
    """
    print("beautiful graph code!")


def main():
    """
    Example program 3: reads movies.csv, computes every column's
    maximum, then reports and "graphs" the results.
    """
    data = data_utils.read_data(
        "movies.csv",
        skip_header=True,
        type_casts=[str, str, int, float],
    )
    max_vals = maximums(data)
    report_maxes(max_vals)
    graph_maxes(max_vals)


if __name__ == "__main__":
    main()

# program design is better because
# - these two tasks aren't inherently linked
# - more flexible and powerful program later
# - easy to display results in different ways
#   (or do more calculations later)

"""
plt.hist example
"""
import matplotlib.pyplot as plt

"""
Next time
- string processing & string manipulation
- data and data sources
"""