{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Warmup tasks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. What is the length of chromosome 7 on the reference sequence?" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "159345973\n" ] } ], "source": [ "# Open the reference file (fasta) and loop through it, removing the first line (starts with >)\n", "# Sum the length of the lines, removing the newline character\n", "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "length = 0\n", "for line in f:\n", " if not line.startswith(\">\"):\n", " length += len(line.strip()) # add the length of the line, without newline character\n", "print(length)\n", "f.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4. How many genes are annotated in the GTF file?" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "58395\n" ] } ], "source": [ "# Open the GTF file and loop throught it.\n", "# To find the genes, check the 3rd element of each line and add 1 if the feature is \"gene\"\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "genes = 0\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " feature = line.split(\"\\t\")[2]\n", " if feature == \"gene\":\n", " genes +=1\n", "print(genes)\n", "g.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Architect a method" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "All following tasks are related to the CFTR gene." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. How many transcripts can the CFTR gene generate?" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of transcripts: 11\n" ] } ], "source": [ "# Open the GTF file and loop throught it.\n", "# To find the number of transcripts, check the 3rd element for \"transcript\" while the specific gene exists\n", "# Can also be solved with bash using: \n", "# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep \"\\ttranscript\\t\" | wc -l\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "# Find number of transcripts\n", "transcripts = 0\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " entries = line.split(\"\\t\")\n", " if \"ENSG00000001626\" in line and entries[2] == \"transcript\":\n", " transcripts += 1\n", "print(\"Number of transcripts: \", transcripts)\n", "g.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Which of these transcripts is the longest transcript in nucleotides?" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Longest transcript: \"ENST00000003084\" \n", "Length: 188703\n" ] } ], "source": [ "# Open the GTF file and loop throught it.\n", "# Find all the instances of transcripts for specific gene\n", "# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)\n", "# If longer than previous longest, store it and continue\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "# Find longest transcript\n", "longest_transcript = (0, '') # update the list with length and transcript ID with a for loop\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " entries = line.strip().split(\"\\t\")\n", " if \"ENSG00000001626\" in line and entries[2] == \"transcript\":\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " length = int(entries[4]) - int(entries[3]) + 1\n", " if length > longest_transcript[0]:\n", " longest_transcript = (length, transcriptID)\n", " \n", "print(\"Longest transcript: \", longest_transcript[1], \"\\nLength: \", longest_transcript[0])\n", "\n", "g.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. Fetch the DNA sequence for that transcript." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\"ENST00000003084\" 117479963 117668665\n", "AATTGGAAGC ...\n" ] } ], "source": [ "# Get the longest transcript from the previous step \n", "# and extract the start and end position from the columns 3,4\n", "# Save the whole sequence from the fasta file to a variable\n", "# and get the start-1 to end from this variable \n", "# (incl. its start and end positions)\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " entries = line.strip().split(\"\\t\")\n", " if \"ENSG00000001626\" in line and entries[2] == \"transcript\":\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " start = int(entries[3])\n", " end = int(entries[4])\n", " print(transcriptID, start, end)\n", "\n", "g.close()\n", "\n", "# Extract the longest transcript sequence from the genome fasta file\n", "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "# Store the DNA sequence of chromosome 7 in a variable\n", "seqList = []\n", "for line in f:\n", " if not line.startswith(\">\"):\n", " seqList.append(line.strip())\n", "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n", "\n", "f.close()\n", "\n", "# Store DNA sequence of longest transcript in variable\n", "cftr = chr7[start-1:end] \n", "print(cftr[:10], \"...\")\n", "\n", "d = open(\"CFTR_longest_transcript.txt\", 'w')\n", "d.write(cftr)\n", "d.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check the sequence with the package utils.check_answers (note that I moved the directory \"utils\" to the directory where the Jupyter notebook is)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The sequences are matching!\n" ] } ], "source": [ "from utils import check_answers\n", "check_answers.ex3(\"CFTR_longest_transcript.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4. Fetch the DNA sequences of all exons for that transcript, spliced together to one sequence." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AATTGGAAGC ...\n" ] } ], "source": [ "# Open the GTF file and loop throught it. Get the same as in the previous question\n", "# but store all of the transcripts in a set\n", "# Create a variable from the other file, like in the previous question\n", "# Loop through the set in the other file and pick the positions of the exons\n", "# Get the longest transcript and its exons\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "exons = []\n", "\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " entries = line.strip().split(\"\\t\")\n", " if \"ENSG00000001626\" in line and entries[2] == \"exon\":\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " start = int(entries[3])\n", " end = int(entries[4])\n", " exons.append((start, end))\n", "\n", "g.close()\n", "\n", "# Extract all exons of the longest transcript sequence from the genome fasta file\n", "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "seqList = [] # store the DNA sequence of chromosome 7 in a list\n", "for line in f:\n", " if not line.startswith(\">\"):\n", " seqList.append(line.strip())\n", "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n", "\n", "f.close()\n", "\n", "mRNA = \"\" # add exon sequences to string\n", "for exon in exons:\n", " mRNA += chr7[exon[0]-1:exon[1]]\n", "\n", "print(mRNA[:10], \"...\")\n", "\n", "d = open(\"CFTR_longest_transcript_exons.txt\", 'w')\n", "d.write(mRNA)\n", "d.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check the sequence with the package utils.check_answers" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The sequences are matching!\n" ] } ], "source": [ "# no need to import again as it was imported in a cell above\n", "from utils import check_answers\n", "check_answers.ex4(\"CFTR_longest_transcript_exons.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 5. What are the start positions and sequences of the start_codon and stop_codon for that transcript?" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The start codon is ATG and starts at position 117480095\n", "The stop codon is TAG and starts at position 117667106\n" ] } ], "source": [ "# Get the longest transcript incl. its start and stop codon positions\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " entries = line.strip().split(\"\\t\")\n", " if \"ENSG00000001626\" in line and entries[2] == 'start_codon' or entries[2] == 'stop_codon': # check for start or stop codons\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " if entries[2] == 'start_codon':\n", " start_codon = int(entries[3]) # save start codon\n", " elif entries[2] == 'stop_codon':\n", " stop_codon = int(entries[3]) # save stop codon\n", "\n", "g.close()\n", "\n", "# Extract all start and stop codon of the longest transcript sequence from the genome fasta file\n", "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "seqList = [] # store the DNA sequence of chromosome 7 in a list\n", "for line in f:\n", " if not line.startswith(\">\"):\n", " seqList.append(line.strip())\n", "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n", "\n", "f.close()\n", "\n", "if chr7[start_codon-1:start_codon+2] == \"ATG\":\n", " print(\"The start codon is\", chr7[start_codon-1:start_codon+2], \"and starts at position\", start_codon)\n", "else:\n", " print(\"The start codon is not ATG\")\n", "\n", "if chr7[stop_codon-1:stop_codon+2] == \"TAG\" or chr7[stop_codon-1:stop_codon+2] == \"TAA\" or chr7[stop_codon-1:stop_codon+2] == \"TGA\":\n", " print(\"The stop codon is\", chr7[stop_codon-1:stop_codon+2], \"and starts at position\", stop_codon)\n", "else:\n", " print(\"The stop codon does not correspond to a proper stop codon\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6. Translate the above sequence of all exons into amino acids, using an implementation of the translation table from the utils.rna package" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The first exon has the positions (117479963, 117480147)\n", "The start codon has the position 117480095\n", "MQRSPLEKAS ...\n" ] } ], "source": [ "# Get the longest transcript and its exons, as well as the start codon\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "exons = []\n", "\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " entries = line.strip().split(\"\\t\")\n", " if \"ENSG00000001626\" in line:\n", " if entries[2] == \"exon\":\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " start = int(entries[3])\n", " end = int(entries[4])\n", " exons.append((start, end))\n", " elif entries[2] == 'start_codon':\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " start_codon = int(entries[3])\n", "\n", "print(\"The first exon has the positions\", exons[0])\n", "print(\"The start codon has the position\", start_codon)\n", "g.close()\n", "\n", "# Extract all exons of the longest transcript sequence from the genome fasta file\n", "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "seqList = [] # store the DNA sequence of chromosome 7 in a list\n", "for line in f:\n", " if not line.startswith(\">\"):\n", " seqList.append(line.strip())\n", "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n", "\n", "f.close()\n", "\n", "mRNA = \"\" # add exon sequences to string\n", "for exon in exons:\n", " mRNA += chr7[exon[0]-1:exon[1]]\n", "\n", "# Translate to aminoacids, from the correct start position on\n", "if chr7[start_codon-1:start_codon+2] != \"ATG\":\n", " print(\"The start codon is not ATG\")\n", "else:\n", " start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon\n", "\n", "from utils.rna import translate_dna\n", "aminoacids = translate_dna(mRNA[start:]) # translate the mRNA sequence from the start codon on\n", "print(aminoacids[:10], \"...\")\n", "\n", "a = open(\"CFTR_longest_transcript_aminoacids.txt\", 'w')\n", "a.write(aminoacids)\n", "a.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check the sequence with the package utils.check_answers" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The sequences are matching!\n" ] } ], "source": [ "check_answers.ex6(\"CFTR_longest_transcript_aminoacids.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Use the python code you have written to solve the tasks above to determine which of five patients are carrying a mutation in the CFTR gene that causes a truncated protein." ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The sequence of patient 3 has a different length than the reference genome sequence.\n" ] } ], "source": [ "# Get the longest transcript and its exons, as well as the start codon\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "exons = []\n", "\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " entries = line.strip().split(\"\\t\")\n", " if \"ENSG00000001626\" in line:\n", " if entries[2] == \"exon\":\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " start = int(entries[3])\n", " end = int(entries[4])\n", " exons.append((start, end))\n", " elif entries[2] == 'start_codon':\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " start_codon = int(entries[3])\n", "\n", "g.close()\n", "\n", "# Extract all exons of the longest transcript sequence from the genome fasta file\n", "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "seqList = [] # store the DNA sequence of chromosome 7 in a list\n", "for line in f:\n", " if not line.startswith(\">\"):\n", " seqList.append(line.strip())\n", "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n", "\n", "f.close()\n", "\n", "mRNA = \"\" # add exon sequences to string\n", "for exon in exons:\n", " mRNA += chr7[exon[0]-1:exon[1]]\n", "\n", "# Translate to aminoacids, from the correct start position on\n", "if chr7[start_codon-1:start_codon+2] != \"ATG\":\n", " print(\"The start codon is not ATG\")\n", "else:\n", " start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon\n", "\n", "from utils.rna import translate_dna\n", "aminoacids = translate_dna(mRNA[start:]) # translate the mRNA sequence from the start codon on\n", "\n", "i = 1 # index for loop through patient fasta files\n", "while i < 6:\n", " p = open(\"Patient\" + str(i) + \".fa\", 'r')\n", " patientSeqList = [] # store the DNA sequence of the patient in a list\n", " for line in p:\n", " if not line.startswith(\">\"):\n", " patientSeqList.append(line.strip())\n", " patientSeq = \"\".join(patientSeqList) # construct one sequence for the currently analyzed patient\n", "\n", " patientmRNA = \"\" # append exon sequences to string. Works only for small strings.\n", " for exon in exons:\n", " patientmRNA += patientSeq[exon[0]-1:exon[1]]\n", "\n", " # Translate to aminoacids\n", " patient_aminoacids = translate_dna(patientmRNA[start:]) # translate the mRNA sequence from the start codon on\n", " if not len(patient_aminoacids) == len(aminoacids):\n", " print(\"The sequence of patient \", str(i), \" has a different length than the reference genome sequence.\")\n", "\n", " p.close()\n", " i += 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extra task: Use BioPython to parse the fasta file and to translate DNA nucleotides into amino acids." ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The sequence of patient 3 has a different length than the reference genome sequence.\n" ] } ], "source": [ "from Bio import SeqIO\n", "from Bio.Seq import Seq\n", "from Bio.Alphabet import IUPAC\n", "\n", "# Get the longest transcript and its exons, as well as the start codon\n", "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "exons = []\n", "\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", " entries = line.strip().split(\"\\t\")\n", " if \"ENSG00000001626\" in line:\n", " if entries[2] == \"exon\":\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " start = int(entries[3])\n", " end = int(entries[4])\n", " exons.append((start, end))\n", " elif entries[2] == 'start_codon':\n", " attribute = entries[8].split(\"; \")\n", " transcriptID = attribute[2].strip().split(' ')[1]\n", " if '\"ENST00000003084\"' == transcriptID:\n", " start_codon = int(entries[3])\n", "\n", "g.close()\n", "\n", "\n", "# Extract all exons of the longest transcript sequence from the genome fasta file\n", "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "records = list(SeqIO.parse(f, \"fasta\")) # BioPython\n", "if records[0].id == \"7\": # BioPython\n", " chr7 = records[0].seq # BioPython\n", "\n", "f.close()\n", "\n", "\n", "mRNA = \"\" # add exon sequences to string\n", "for exon in exons:\n", " mRNA += chr7[exon[0]-1:exon[1]]\n", "\n", "# Translate to aminoacids, from the correct start position on\n", "if chr7[start_codon-1:start_codon+2] != \"ATG\":\n", " print(\"The start codon is not ATG\")\n", "else:\n", " start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon\n", "\n", "aminoacids = mRNA[start:].translate(to_stop=True) # BioPython\n", "\n", "i = 1 # index for loop through patient fasta files\n", "while i < 6:\n", " p = \"Patient\" + str(i) + \".fa\"\n", " patientRecords = list(SeqIO.parse(p, \"fasta\")) # BioPython\n", " if patientRecords[0].id == \"7\": # BioPython\n", " patientSeq = patientRecords[0].seq # BioPython\n", "\n", " patientmRNA = \"\" # append exon sequences to string. Works only for small strings.\n", " for exon in exons:\n", " patientmRNA += patientSeq[exon[0]-1:exon[1]]\n", "\n", " # Translate to aminoacids\n", " patient_aminoacids = patientmRNA[start:].translate(to_stop=True) # BioPython\n", " \n", " if not len(patient_aminoacids) == len(aminoacids):\n", " print(\"The sequence of patient \", str(i), \" has a different length than the reference genome sequence.\")\n", "\n", " i += 1" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }