""" John Rachlin DS 2000: Intro to Programming with Data Date: Thu Dec 1 18:57:24 2022 File: genome.py Description: An exploration of the human genome """ import dataproc as dp import matplotlib.pyplot as plt def read_fasta(filename): """ Read fasta-formatted genome sequence filename - The name of the file returns - The genome sequence as a string """ with open(filename, 'r') as infile: infile.readline() # skip fasta header data = infile.read().replace('\n', '') return data def gc(base): """ Return 1 if base is G/C, 0 otherwise """ if base in ['G','C','g','c']: return 1 else: return 0 def gc_encode(seq): """ Convert a DNA sequences to 1's (G/C) and 0's (A/T)""" return [gc(base) for base in seq] def percent_gc(seq, precision=3): """ Compute the percentage of Gs and Cs in a nucleotide sequence seq - A DNA sequence precision - # of decimal places return - % GC of the sequence """ return round(sum(gc_encode(seq)) / len(seq), precision) def find_consecutive_bases(seq, letter, min_reps=20): """ Find the locations of repeating bases, and the associated number repeats seq - DNA sequence letter - nucleotide we are looking for (A,T,G, or C) min_reps - ignore repetitions below this count returns - List of locations and list of repetition counts """ location = [] consec = [] base = seq[0] reps = 0 for i in range(1, len(seq)): if seq[i] == seq[i-1]: reps += 1 else: if reps >= min_reps: consec.append(reps) location.append(i - reps) reps = 0 return location, consec def avg_gc(seq, window_size=2000): """ Moving average GC content for a given sequence seq - A DNA Sequence window_size - The window size for the moving average return - moving average sequence """ encoded = gc_encode(seq) avg = dp.moving_average(encoded, window_size=window_size) return avg def plot_gc(human, chimp): """ Plot average GC content for human and chimp """ havg = avg_gc(human) cavg = avg_gc(chimp) plt.figure(figsize=(10,4), dpi=200) plt.ylim(0.2, 0.6) plt.grid() plt.plot(havg, color='blue', label='human') plt.plot(cavg, color='red', label='chimp') plt.xlabel('DNA Position (Chimp offset=27k bases)') plt.ylabel('GC (moving avg., window=2000)') plt.title('Chromosome 3: GC content') plt.legend() plt.savefig('pct_gc.png') plt.show() def plot_reps(human, chimp): """ Plot moving average repeat counts for human and chimp """ hloc, hcon = find_consecutive_bases(human, 'A', min_reps=1) cloc, ccon = find_consecutive_bases(chimp, 'A', min_reps=1) plt.figure(figsize=(10,4), dpi=200) plt.grid() plt.ylim(1.4, 1.6) plt.xlabel('DNA Position (Chimp offset=27k bases)') plt.ylabel('Repeat-A length (moving avg., window=2000)') plt.title('Chromosome 3: Repeating Base Lengths') hsmooth = dp.moving_average(hcon, window_size=2000) csmooth = dp.moving_average(ccon, window_size=2000) #plt.plot(hsmooth, color='blue', label='human') #plt.plot(csmooth, color='red', label='chimp') plt.scatter(hloc, hsmooth, marker='.', s=1, color='blue', label='human') plt.scatter(cloc, csmooth, marker='.', s=1, color='red', label='chimp') plt.legend() plt.savefig('repeats.png') plt.show() def main(): # read human and chimp sequences human = read_fasta('human_small.fa') offset = 27000 chimp = read_fasta('chimp_small.fa')[offset:] # Plot average GC over the sequence plot_gc(human, chimp) # Let's look more closely at the sequences # where GC content diverges human_frag = human[134000:136000] print(human_frag) print("Human Fragment %GC: ", percent_gc(human_frag)) chimp_frag = chimp[134000:136000] print(chimp_frag) print("Chimp Fragment %GC: ", percent_gc(chimp_frag)) # Plot repetition locations plot_reps(human, chimp) if __name__ == '__main__': main()