""" John Rachlin DS 2000: Intro to Programming with Data Filename: dataproc.py Description: A personal library of data processing utilities """ def avg(L): """ Compute the numerical average of a list of numbers. If list is empty, return 0.0 """ if len(L) > 0: return sum(L) / len(L) else: return 0.0 def impute_values(L, missing_value = ''): """ Fill in missing values with the previous value in the list L - list of numeric values missing_value - value for a missing value return new list with imputed values """ imputed = L[:] # copy the original list for i in range(len(imputed)): if imputed[i] == missing_value and i != 0: imputed[i] = imputed[i-1] return imputed def get_window(L, idx, window_size=1): """ Extract a window of values of specified size centered on the specified index L: List of values idx: Center index window_size: window size """ minrange = max(idx - window_size // 2, 0) maxrange = idx + window_size // 2 + (window_size % 2) return L[minrange:maxrange] def moving_average(L, window_size=1): """ Compute a moving average over the list L using the specified window size L: List of values window_size - The window size (default=1) return - A new list with smoothed values """ mavg = [] for i in range(len(L)): window = get_window(L, i, window_size) mavg.append(avg(window)) return mavg def percentiles(L): """ Compute percentiles of values in a list """ srt = sorted(L) # Sort items in assending order length = len(L) pct = [0.0] # smallest value is at the zeroth percentile for i in range(1, length): if srt[i] == srt[i-1]: # value unchanged pct.append(pct[i-1]) else: pct.append(i / length) return pct def extract_column(data, column, impute=False): """ Extract a column of data from a 2D dataset data - list of lists colidx - column index to be extracted impute - whether to impute missing data """ col = [] for row in data: col.append(row[column]) if impute: return impute_values(col) else: return col def read_table(filename, coltypes, headers=1, delimiter = ','): """ Read a table of data filename - the name of the file coltypes - the data type for each column (int, float, str) headers - The number of headerlines to ignore (Default:1) delimiter - field delimiter (Default: ',') Return: a table of data as a list of lists """ data = [] with open(filename, 'r') as infile: # ignore headers for _ in range(headers): infile.readline() # Read remaining lines for line in infile: vals = line.strip().split(delimiter) # Convert data type for i in range(len(vals)): if vals[i] != '': vals[i] = coltypes[i](vals[i]) data.append(vals) # add row to data return data; def read_table_to_dict(filename, coltypes, delimiter = ','): """ Read a csv file to a list of dictionaries filename - the name of the file. File must have a header. coltypes - the data type for each column (int, float, str) delimiter - field delimiter (Default: ',') Return: a table of data as a list of dictionaries """ data = [] with open(filename, 'r') as infile: # read the header header = infile.readline().strip().split(delimiter) # Read remaining lines for line in infile: rowdict = {} # parse values vals = line.strip().split(delimiter) # Store key value pairs for i in range(len(vals)): key = header[i] value = vals[i] if value != '': value = coltypes[i](value) rowdict[key] = value data.append(rowdict) # add row to data return data; def read_text(filename): """ Read a text file into one long string filename - The name of the text file returns - A string containing the complete text of the file """ data = "" for line in open(filename, "r"): data += line return data def clean_word(word): """ Clean a word, converting to lowercase and removing punctuation Parameters: word - A word to be cleaned Return: cleaned up word """ punctuation = "',?;:!.’\"" for c in punctuation: word = word.replace(c,'') word = word.lower() return word def text_to_words(text): """ Convert a string to a list of cleaned words """ words = text.split() for i in range(len(words)): words[i] = clean_word(words[i]) return words # Test code for data processing functions def main(): # imports needed for test code, not for library functions import matplotlib.pyplot as plt import random as rnd # Some a random walk L = [] x = 0 for i in range(1000): x += rnd.choice([-1, 0, 1]) L.append(x) plt.figure(figsize=(10,10), dpi=150) plt.grid() plt.scatter(range(1000), L, marker = '.') mavg10 = moving_average(L, window_size=10) plt.plot(mavg10, color='r') mavg50 = moving_average(L, window_size=200) plt.plot(mavg50, color='g') plt.show() pcts = percentiles(L) plt.figure(figsize=(10,10), dpi=150) plt.plot(sorted(L), pcts, color = 'b') plt.show() if __name__ == '__main__': main()