"""
John Rachlin
DS 2000: Intro to Programming with Data

Filename: dataproc.py

Description: A personal library of data processing utilities
"""


def avg(L):
    """
    Compute the numerical average of a list of numbers.
    If list is empty, return 0.0

    L - list of numeric values
    return - sum(L) / len(L), or 0.0 when L has no elements
    """
    if len(L) > 0:
        return sum(L) / len(L)
    return 0.0


def impute_values(L, missing_value=''):
    """
    Fill in missing values with the previous value in the list

    L - list of numeric values
    missing_value - value for a missing value
    return new list with imputed values

    The input list is copied, not modified. A missing value at index 0
    has no previous value and is therefore left unchanged.
    """
    imputed = L[:]  # copy the original list

    # Start at index 1: the first element has nothing before it to copy.
    # Walking left to right means a run of missing values is filled with
    # the last non-missing value seen before the run.
    for i in range(1, len(imputed)):
        if imputed[i] == missing_value:
            imputed[i] = imputed[i - 1]
    return imputed


def get_window(L, idx, window_size=1):
    """
    Extract a window of values of specified size
    centered on the specified index

    L: List of values
    idx: Center index
    window_size: window size
    return - the slice of L covering the window

    The window spans window_size // 2 items before idx and, counting idx
    itself, window_size // 2 (+1 when window_size is odd) items from idx
    onward. Near either end of L the window is truncated, so fewer than
    window_size values may be returned.
    """
    half = window_size // 2
    minrange = max(idx - half, 0)  # clamp so the slice never wraps around
    maxrange = idx + half + (window_size % 2)
    return L[minrange:maxrange]


def moving_average(L, window_size=1):
    """
    Compute a moving average over the list L
    using the specified window size

    L: List of values
    window_size - The window size (default=1)
    return - A new list with smoothed values (same length as L)
    """
    return [avg(get_window(L, i, window_size)) for i in range(len(L))]


def percentiles(L):
    """
    Compute percentiles of values in a list

    L - list of mutually comparable values
    return - list of fractions in [0, 1), one per value of L, given in
             ascending (sorted) order of the values, NOT in the original
             order of L. Each entry is the fraction of values that are
             strictly smaller; equal values share the same percentile.
             An empty input yields an empty list.
    """
    srt = sorted(L)  # Sort items in ascending order
    length = len(srt)
    if length == 0:
        return []  # no values, so no percentiles to report

    pct = [0.0]  # smallest value is at the zeroth percentile
    for i in range(1, length):
        if srt[i] == srt[i - 1]:  # value unchanged
            pct.append(pct[i - 1])
        else:
            pct.append(i / length)
    return pct


def extract_column(data, colidx, impute=False):
    """
    Extract a column of data from a 2D dataset

    data - list of lists
    colidx - column index to be extracted
    impute - whether to impute missing data
    return - list holding the colidx-th item of every row; when impute
             is True, the result of impute_values() on that list
    """
    col = [row[colidx] for row in data]
    if impute:
        return impute_values(col)
    return col


def read_table(filename, coltypes, headers=1, delimiter=','):
    """
    Read a table of data

    filename - the name of the file
    coltypes - the data type for each column (int, float, str)
    headers - The number of header lines to ignore (Default:1)
    delimiter - field delimiter (Default: ',')
    Return: a table of data as a list of lists

    Each line is stripped of surrounding whitespace and split on the
    delimiter. Non-empty fields are converted with the matching entry of
    coltypes; empty fields are kept as the empty string ''.
    """
    data = []
    with open(filename, 'r') as infile:

        # ignore headers
        for _ in range(headers):
            infile.readline()

        # Read remaining lines
        for line in infile:
            vals = line.strip().split(delimiter)

            # Convert data type (empty fields stay '' to mark missing data)
            for i, val in enumerate(vals):
                if val != '':
                    vals[i] = coltypes[i](val)

            data.append(vals)  # add row to data

    return data


# Test code for data processing functions
def main():
    """
    Demo: plot a random walk with two moving averages, then plot the
    percentile curve of the walk's values.
    """
    # imports needed for test code, not for library functions
    import matplotlib.pyplot as plt
    import random as rnd

    # Simulate a random walk
    steps = 1000
    L = []
    x = 0
    for _ in range(steps):
        x += rnd.choice([-1, 0, 1])
        L.append(x)

    plt.figure(figsize=(10, 10), dpi=150)
    plt.grid()
    plt.scatter(range(steps), L, marker='.')

    mavg10 = moving_average(L, window_size=10)
    plt.plot(mavg10, color='r')

    mavg200 = moving_average(L, window_size=200)
    plt.plot(mavg200, color='g')

    plt.show()

    pcts = percentiles(L)
    plt.figure(figsize=(10, 10), dpi=150)
    plt.plot(sorted(L), pcts, color='b')
    plt.show()


if __name__ == '__main__':
    main()