from utils.rna import translate_dna from utils import check_answers from pprint import pprint import re def parse_gtf(lines): ''' Parses lines from a GTF file and returns a list of dictionaries with the data. Note that the dicts take more memory than just iterating over lines so if you have many lines but are only interested in a few of them try filtering them first to avoid creating too many dicts. ''' rows = [] for line in lines: if line.startswith('#'): continue fields = line.strip().split('\t') d = {} # print(line, fields) d['seqname'] = fields[0] d['source'] = fields[1] d['feature'] = fields[2] d['start'] = fields[3] d['end'] = fields[4] d['score'] = fields[5] d['strand'] = fields[6] d['frame'] = fields[7] attributes = fields[8] for attribute in re.split(';\s*', attributes.strip(';')): # attribute looks like 'gene_version "5"' key_and_value = attribute.split(' ', maxsplit=1) key = key_and_value[0] value = key_and_value[1] d[key] = value # clean up data: for key in d: d[key] = d[key].strip('"') if re.match('^\d+$', d[key]): d[key] = int(d[key]) rows.append(d) return rows example_line = '1\thavana\tgene\t11869\t14409\t.\t+\t.\tgene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene";' # pprint([parse_gtf([example_line])], sort_dicts=False) CFTR_gene_id = 'ENSG00000001626' CFTR_lines = [] fh = open('./data/Homo_sapiens.GRCh38.93.gtf', 'r') for line in fh: if CFTR_gene_id in line: CFTR_lines.append(line) fh.close() rows = parse_gtf(CFTR_lines) # pprint(rows[:2], sort_dicts=False) num_transcripts = 0 for row in rows: if row['feature'] == 'transcript' and row['gene_id'] == CFTR_gene_id: num_transcripts += 1 print(f''' 1. How many transcripts can the CFTR gene generate? {num_transcripts = } ''') longest_length = 0 longest_row = None for row in rows: if row['feature'] == 'transcript' and row['gene_id'] == CFTR_gene_id: length = row['end'] - row['start'] + 1 if length > longest_length: longest_length = length longest_row = row print(f''' 2. Which of these transcripts is the longest transcript in nucleotides? {longest_length = } {longest_row['transcript_id'] = } ''') def fasta_subsequence(filename, start, end): ''' Opens a fasta file with only one sequence in it and returns the subsequence from start to end inclusive using 1-indexing. ''' fh = open(filename, 'r') seq = '' for line in fh: if line.startswith('>'): continue seq += line.strip() fh.close() return seq[start - 1: end] def fasta_subsequences(filename, start_end_list): ''' Opens a fasta file with only one sequence in it and returns the concatenation of subsequences of start to end inclusive using 1-indexing in the list start_end_list . ''' fh = open(filename, 'r') seq = '' for line in fh: if line.startswith('>'): continue seq += line.strip() fh.close() out = '' for start_end in start_end_list: start = start_end[0] end = start_end[1] out += seq[start - 1: end] return out c7_filename = './data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa' longest_seq = fasta_subsequence(c7_filename, longest_row['start'], longest_row['end']) print(f''' 3. Fetch the DNA sequence for that transcript. {longest_seq[:10] = } {len(longest_seq) = } ''') fh = open('longest_seq.txt', 'w') fh.write(longest_seq) fh.close() check_answers.ex3('longest_seq.txt') exons_pos = [] for row in rows: if row['feature'] == 'exon' and row['transcript_id'] == longest_row['transcript_id']: exons_pos.append([row['start'], row['end']]) mrna_seq = fasta_subsequences(c7_filename, exons_pos) print(f''' 4. Fetch the DNA sequences of all exons for that transcript, spliced together to one sequence. {exons_pos[:2] = } {len(exons_pos) = } {mrna_seq[:10] = } {len(mrna_seq) = } ''') fh = open('mrna_seq.txt', 'w') fh.write(mrna_seq) fh.close() check_answers.ex4('mrna_seq.txt') for row in rows: if row.get('transcript_id') == longest_row['transcript_id']: if row['feature'] == 'start_codon': start_codon_row = row if row['feature'] == 'stop_codon': stop_codon_row = row start_codon_seq = fasta_subsequence(c7_filename, start_codon_row['start'], start_codon_row['end']) stop_codon_seq = fasta_subsequence(c7_filename, stop_codon_row['start'], stop_codon_row['end']) print(f''' 5. What are the start positions and sequences of the start_codon and stop_codon for that transcript? {start_codon_row['start'] = } {start_codon_seq = } {stop_codon_row['start'] = } {stop_codon_seq = } ''') first_exon_pos = exons_pos[0] first_exon_start = first_exon_pos[0] first_exon_stop = first_exon_pos[1] cds_pos = [[start_codon_row['start'], first_exon_stop]] + exons_pos[1:] cds = fasta_subsequences(c7_filename, cds_pos) protein = translate_dna(cds) print(f''' 6. Translate the above sequence of all exons into amino acids {cds_pos[:2] = } {len(cds_pos) = } {cds[:10] = } {len(cds) = } {protein[:10] = } {len(protein) = } ''') fh = open('protein.txt', 'w') fh.write(protein) fh.close() check_answers.ex6('protein.txt') files = [ './data/Patient1.fa', './data/Patient2.fa', './data/Patient3.fa', './data/Patient4.fa', './data/Patient5.fa', ] print('=== FINAL REPORT ===') for patient_file in files: patient_cds = fasta_subsequences(patient_file, cds_pos) patient_protein = translate_dna(patient_cds) if len(patient_protein) < len(protein): print(f'{patient_file} is at risk') print('=== END OF REPORT ===')