"""

John Rachlin
DS 2000: Intro to Programming with Data

Filename: dataproc.py
    
Description: A personal library of data processing utilities

    
"""



def avg(L):
    """ Return the arithmetic mean of the numbers in L.

    L - list of numeric values
    return - the sum of the values divided by how many there are,
             or 0.0 when the list is empty
    """

    count = len(L)
    if count == 0:
        # nothing to average: avoid dividing by zero
        return 0.0
    return sum(L) / count
    
    
    

def impute_values(L, missing_value = ''):
    """ Replace missing entries with the value that precedes them.

    L - list of values (left unmodified)
    missing_value - the marker that denotes a missing entry
    return - a copy of L in which every missing entry after the first
             position has been overwritten with its predecessor.  Runs of
             missing entries are all filled with the last known value.
             A missing entry at position 0 has no predecessor and is
             left as-is.
    """

    filled = L[:]  # work on a copy so the caller's list is untouched
    for pos, current in enumerate(filled):
        if current == missing_value and pos > 0:
            # filled[pos - 1] has already been imputed if it was missing
            filled[pos] = filled[pos - 1]
    return filled



def get_window(L, idx, window_size=1):
    """ Slice out a window of values around a given index.

    L: List of values
    idx: Index the window is positioned on
    window_size: Desired number of values in the window (default=1)

    return: L[start:stop] where start is idx - window_size // 2 (but
            never below 0) and stop is idx + window_size // 2, plus one
            more when window_size is odd.  Near either end of the list
            the window therefore holds fewer than window_size values.
    """

    half, odd = divmod(window_size, 2)

    start = idx - half
    if start < 0:
        start = 0  # clip at the front; a negative start would wrap around

    stop = idx + half + odd  # slicing clips the back end automatically
    return L[start:stop]


def moving_average(L, window_size=1):
    """ Smooth a list of numbers with a moving average.

    L: List of values
    window_size - Number of neighboring values averaged per point (default=1)
    return - A new list, the same length as L, where each entry is the
             average (see avg) of the window (see get_window) positioned
             on that index
    """
    return [avg(get_window(L, center, window_size))
            for center in range(len(L))]
    
    

def percentiles(L):
    """ Compute the percentile rank of each value in a list.

    L - list of mutually comparable values
    return - list of fractions, one per value, given in ASCENDING value
             order (i.e. aligned with sorted(L), not with L itself).
             A value's percentile is the fraction of items that are
             strictly smaller than it, so the smallest value is at 0.0
             and equal values share the same percentile.
             An empty list yields an empty list.
    """
    srt = sorted(L) # Sort items in ascending order
    length = len(srt)

    if length == 0:
        # no values -> no percentiles (keeps len(result) == len(L))
        return []

    pct = [0.0]  # smallest value is at the zeroth percentile
    for i in range(1, length):
        if srt[i] == srt[i-1]: # value unchanged: tie with previous item
            pct.append(pct[i-1])
        else:
            # exactly i items precede this one and all are smaller
            pct.append(i / length)

    return pct




def extract_column(data, column, impute=False):
    """ Pull one column out of a 2D dataset.

    data - list of rows, each row being a list
    column - index of the item to take from every row
    impute - when True, the extracted column is passed through
             impute_values (with its default missing-value marker)
             before being returned
    return - list holding row[column] for every row, in row order
    """
    values = [row[column] for row in data]
    return impute_values(values) if impute else values
    
    
    

def read_table(filename, coltypes, headers=1, delimiter = ','):
    """ Read a table of data
    filename - the name of the file
    coltypes - the data type for each column (int, float, str);
               coltypes[i] is called on every non-empty value of column i
    headers - The number of headerlines to ignore (Default:1)
    delimiter - field delimiter (Default: ',')

    Return: a table of data as a list of lists (one list per line).
            Empty fields are kept as '' and are not converted.

    Raises: IndexError if a line has more fields than coltypes,
            plus whatever the coltypes converters raise on bad input.
    """

    data = []

    with open(filename, 'r') as infile:

        # ignore headers
        for _ in range(headers):
            infile.readline()

        # Read remaining lines
        for line in infile:
            # Remove only the line terminator before splitting.  Stripping
            # all whitespace here would also eat leading/trailing
            # delimiters when the delimiter itself is whitespace (e.g. a
            # tab), dropping empty edge fields and shifting the columns.
            vals = line.rstrip('\r\n').split(delimiter)

            # Surrounding whitespace at the edges of the line is still
            # discarded, as before.
            vals[0] = vals[0].lstrip()
            vals[-1] = vals[-1].rstrip()

            # Convert data type (missing values stay as '')
            for i, raw in enumerate(vals):
                if raw != '':
                    vals[i] = coltypes[i](raw)

            data.append(vals) # add row to data

    return data
            

    

def read_table_to_dict(filename, coltypes, delimiter = ','):
    """ Read a csv file to a list of dictionaries
    filename - the name of the file.  File must have a header.
    coltypes - the data type for each column (int, float, str);
               coltypes[i] is called on every non-empty value of column i
    delimiter - field delimiter (Default: ',')

    Return: a table of data as a list of dictionaries, one per line
            after the header, mapping header name -> converted value.
            Empty fields are kept as '' and are not converted.

    Raises: IndexError if a line has more fields than the header or
            than coltypes, plus whatever the converters raise.
    """

    def split_fields(line):
        # Remove only the line terminator before splitting.  Stripping
        # all whitespace first would also eat leading/trailing delimiters
        # when the delimiter itself is whitespace (e.g. a tab), dropping
        # empty edge fields and misaligning values with the header.
        fields = line.rstrip('\r\n').split(delimiter)
        # Whitespace at the edges of the line is still discarded.
        fields[0] = fields[0].lstrip()
        fields[-1] = fields[-1].rstrip()
        return fields

    data = []

    with open(filename, 'r') as infile:

        # read the header
        header = split_fields(infile.readline())

        # Read remaining lines
        for line in infile:
            rowdict = {}

            # Store key value pairs (missing values stay as '')
            for i, value in enumerate(split_fields(line)):
                key = header[i]
                if value != '':
                    value = coltypes[i](value)
                rowdict[key] = value

            data.append(rowdict) # add row to data

    return data
  
    
  
    
  
                                        
def read_text(filename):
    """ Read a text file into one long string
    filename - The name of the text file
    returns  - A string containing the complete text of the file
    """

    # 'with' guarantees the file is closed, even if reading fails
    with open(filename, "r") as infile:
        return infile.read()


def clean_word(word):
    """ Normalize a word: drop punctuation, then lowercase it

    Parameters: word - The word (string) to normalize
    Return:     the word with every apostrophe, comma, question mark,
                semicolon, colon, exclamation mark, period, curly
                apostrophe and double quote removed, in lowercase
    """

    punctuation = "',?;:!.’\""

    # translation table that maps each punctuation character to "deleted"
    drop_table = str.maketrans('', '', punctuation)
    stripped = word.translate(drop_table)

    return stripped.lower()


def text_to_words(text):
    """ Split a string on whitespace and clean each resulting word

    text - The string to break into words
    return - list with clean_word applied to every whitespace-separated
             token, in order of appearance
    """
    return [clean_word(token) for token in text.split()]



# Test code for data processing functions
def main():
    """ Demo: plot a random walk, two moving averages of it, and the
    percentile curve of its values """

    # imports needed for the demo only, not for the library functions
    import matplotlib.pyplot as plt
    import random as rnd

    # Simulate a random walk: each step moves by -1, 0 or +1
    steps = 1000
    walk = []
    position = 0
    for _ in range(steps):
        position += rnd.choice([-1, 0, 1])
        walk.append(position)

    # Raw walk as a scatter plot
    plt.figure(figsize=(10,10), dpi=150)
    plt.grid()
    plt.scatter(range(steps), walk, marker = '.')

    # Overlay a narrow (red) and a wide (green) moving average
    for width, colour in ((10, 'r'), (200, 'g')):
        smoothed = moving_average(walk, window_size=width)
        plt.plot(smoothed, color=colour)
    plt.show()

    # Percentile of each value, plotted against the sorted values
    ranks = percentiles(walk)
    plt.figure(figsize=(10,10), dpi=150)
    plt.plot(sorted(walk), ranks, color = 'b')
    plt.show()
    


    
# Run the demo only when this file is executed directly as a script,
# not when it is imported as a library module.
if __name__ == '__main__':
    main()

