#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Felix Muzny
12/2/2022
DS 2000
Lecture 23 - timing experiments

Some code to play around with time!
"""

import data_utils
import time
import pandas as pd
import matplotlib.pyplot as plt
import random


# for students in section 2
def generate_coordinate_data(rows):
    """
    Generates a two-dimensional list of x, y coordinates
    where x and y are between 0 and 100 (inclusive).

    Parameters
    ----------
    rows : int
        Number of rows to include in the data.

    Returns
    -------
    data : list
        list of lists of integers; each inner list is one [x, y] pair.

    """
    data = []
    for _ in range(rows):
        # one row is a pair of independently drawn coordinates
        row = [random.randint(0, 100) for _ in range(2)]
        data.append(row)
    return data


# sections 3 & 4
def generate_list(num_elements):
    """
    Generates a one-dimensional list of integers with values
    between 0 and 100 (inclusive).

    Parameters
    ----------
    num_elements : int
        number of integers that should be in the list.

    Returns
    -------
    data : list
        generated list of integers.

    """
    # A list comprehension is the compact form of the "create an empty
    # list, then append inside a for loop" pattern.
    # The loop variable can also be used inside the comprehension if
    # you'd like to, e.g. [i for i in range(num_elements)].
    return [random.randint(0, 100) for _ in range(num_elements)]


def _time_call(func, *args, **kwargs):
    """
    Calls func(*args, **kwargs) once and measures how long the call took.

    Parameters
    ----------
    func : callable
        The function to time.
    *args, **kwargs
        Passed through to func unchanged.

    Returns
    -------
    result : object
        Whatever func returned.
    elapsed : float
        Elapsed time of the call, in seconds.

    """
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() is wall-clock time and may have coarse resolution or be
    # adjusted while the experiment is running.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed


def main():
    """
    Runs the lecture's timing experiments: max() on lists of growing size
    (plotted), loading "trips.csv" with data_utils vs. pandas, and getting
    the "duration" column with data_utils vs. pandas.
    """
    # To test out:
    # - loading data (trips)
    # - getting a column

    # timing questions (in general)
    # will the amount of time increase with the number of elements
    # in the list?
    # if yes, will it increase...
    #   linearly?
    #   exponentially?
    #   something else?

    ls = generate_list(100)
    print(ls)
    print()

    # len()
    # A. no increase <--- (python stores list length as an attribute)
    # B. linear increase
    # C. more than linear increase

    # max()
    # A. no increase
    # B. linear increase <----
    # C. more than linear increase
    xs = []
    ys = []
    for list_length in range(10000, 100000, 10000):
        ls = generate_list(list_length)

        # only the elapsed time matters here, not the maximum itself
        _, elapsed = _time_call(max, ls)

        xs.append(list_length)
        ys.append(elapsed)
        print("Number of elements:", list_length)
        print(elapsed)
        print()

    plt.plot(xs, ys)
    plt.show()

    # data loading
    # (observed in lecture: homegrown version 2.5x longer than pandas)

    # data utils functions
    data, duration = _time_call(
        data_utils.read_data_dict,
        "trips.csv",
        {"duration": int, "bike_id": int},
    )
    print("Our data loading function took:", duration)
    print()

    # pandas functions
    df, duration = _time_call(pd.read_csv, "trips.csv")
    print("pandas data loading function took:", duration)
    print()

    # time the get_column and pandas column functions
    # get_column
    #   0.0137
    # pandas
    # A. >= 0.0137
    # B. 0.005 - 0.0137
    # C. 0.001 - 0.005 <--- 0.003, 0.0012
    # D. < 0.001 <- 0.00095

    # data utils functions
    _, duration = _time_call(data_utils.get_column, data, "duration")
    print("Our get_column function took:", duration)
    print()

    # pandas functions
    # column access is an indexing expression, so wrap it in a callable
    _, duration = _time_call(lambda: df["duration"])
    print("pandas column access function took:", duration)
    print()


if __name__ == "__main__":
    main()