"""
John Rachlin
DS 2000: Intro to Programming with Data


Date: Thu Dec  1 18:57:24 2022

File: genome.py
    
Description: An exploration of the human genome
    
    
"""

import dataproc as dp
import matplotlib.pyplot as plt


def read_fasta(filename):
    """ Read fasta-formatted genome sequence
    filename - The name of the file
    returns  - The genome sequence as a string
    """
    with open(filename, 'r') as infile:
        infile.readline() # skip fasta header
        data = infile.read().replace('\n', '')
    return data


def gc(base):
    """ Return 1 if base is G/C, 0 otherwise """
    if base in ['G','C','g','c']:
        return 1
    else:
        return 0

def gc_encode(seq):
    """ Convert a DNA sequences to 1's (G/C) and 0's (A/T)"""
    return [gc(base) for base in seq]


def percent_gc(seq, precision=3):
    """ Compute the percentage of Gs and Cs
    in a nucleotide sequence
    seq - A DNA sequence
    precision - # of decimal places
    return - % GC of the sequence
    """
    return round(sum(gc_encode(seq)) / len(seq), precision)
    

def find_consecutive_bases(seq, letter, min_reps=20):
    """ Find the locations of repeating bases, and the
    associated number repeats
    seq - DNA sequence
    letter - nucleotide we are looking for (A,T,G, or C)
    min_reps - ignore repetitions below this count
    returns - List of locations and list of repetition counts
    """
    
    location = []
    consec = []
    
    
    base = seq[0]
    reps = 0
    for i in range(1, len(seq)):
        
        if seq[i] == seq[i-1]:
            reps += 1 
        else:
            if reps >= min_reps:
                consec.append(reps)
                location.append(i - reps)
                reps = 0
                
    return location, consec


def avg_gc(seq, window_size=2000):
    """ Moving average GC content for a given sequence 
    seq - A DNA Sequence
    window_size - The window size for the moving average
    return - moving average sequence
    """
    encoded = gc_encode(seq)
    avg = dp.moving_average(encoded, window_size=window_size)
    return avg
    
def plot_gc(human, chimp):
    """ Plot average GC content for human and chimp """
    havg = avg_gc(human)
    cavg = avg_gc(chimp)
    plt.figure(figsize=(10,4), dpi=200)
    plt.ylim(0.2, 0.6)
    plt.grid()
    plt.plot(havg, color='blue', label='human')
    plt.plot(cavg, color='red', label='chimp')
    plt.xlabel('DNA Position (Chimp offset=27k bases)')
    plt.ylabel('GC (moving avg., window=2000)')
    plt.title('Chromosome 3: GC content')
    plt.legend()
    plt.savefig('pct_gc.png')
    plt.show()
    
def plot_reps(human, chimp):
    """ Plot moving average repeat counts for human and chimp """
    hloc, hcon = find_consecutive_bases(human, 'A', min_reps=1)
    cloc, ccon = find_consecutive_bases(chimp, 'A', min_reps=1)
    plt.figure(figsize=(10,4), dpi=200)
    plt.grid()
    plt.ylim(1.4, 1.6)
    plt.xlabel('DNA Position (Chimp offset=27k bases)')
    plt.ylabel('Repeat-A length (moving avg., window=2000)')
    plt.title('Chromosome 3: Repeating Base Lengths')
    hsmooth = dp.moving_average(hcon, window_size=2000)
    csmooth = dp.moving_average(ccon, window_size=2000)
    #plt.plot(hsmooth, color='blue', label='human')
    #plt.plot(csmooth, color='red', label='chimp')
    plt.scatter(hloc, hsmooth, marker='.', s=1, color='blue', label='human')
    plt.scatter(cloc, csmooth, marker='.', s=1, color='red', label='chimp')

    plt.legend()
    plt.savefig('repeats.png')
    plt.show()

    
def main():
    # read human and chimp sequences
    human = read_fasta('human_small.fa')
    offset = 27000
    chimp = read_fasta('chimp_small.fa')[offset:]
    
    
    # Plot average GC over the sequence
    plot_gc(human, chimp)
    
    # Let's look more closely at the sequences
    # where GC content diverges
    human_frag = human[134000:136000]
    print(human_frag)
    print("Human Fragment %GC: ", percent_gc(human_frag))
    
    chimp_frag = chimp[134000:136000]
    print(chimp_frag)
    print("Chimp Fragment %GC: ", percent_gc(chimp_frag))
    
    # Plot repetition locations
    plot_reps(human, chimp)
    
    
if __name__ == '__main__':
    main()