{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Warmup tasks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. What is the length of chromosome 7 on the reference sequence?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "159345973\n"
     ]
    }
   ],
   "source": [
    "# Open the reference file (fasta) and loop through it, removing the first line (starts with >)\n",
    "# Sum the length of the lines, removing the newline character\n",
    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
    "length = 0\n",
    "for line in f:\n",
    "    if not line.startswith(\">\"):\n",
    "        length += len(line.strip()) # add the length of the line, without newline character\n",
    "print(length)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. How many genes are annotated in the GTF file?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "58395\n"
     ]
    }
   ],
   "source": [
    "# Open the GTF file and loop throught it.\n",
    "# To find the genes, check the 3rd element of each line and add 1 if the feature is \"gene\"\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "genes = 0\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        feature = line.split(\"\\t\")[2]\n",
    "        if feature == \"gene\":\n",
    "            genes +=1\n",
    "print(genes)\n",
    "g.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Architect a method"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All following tasks are related to the CFTR gene."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. How many transcripts can the CFTR gene generate?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of transcripts:  11\n"
     ]
    }
   ],
   "source": [
    "# Open the GTF file and loop throught it.\n",
    "# To find the number of transcripts, check the 3rd element for \"transcript\" while the specific gene exists\n",
    "# Can also be solved with bash using: \n",
    "# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep \"\\ttranscript\\t\" | wc -l\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "\n",
    "# Find number of transcripts\n",
    "transcripts = 0\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        entries = line.split(\"\\t\")\n",
    "        if \"ENSG00000001626\" in line and entries[2] == \"transcript\":\n",
    "            transcripts += 1\n",
    "print(\"Number of transcripts: \", transcripts)\n",
    "g.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Which of these transcripts is the longest transcript in nucleotides?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Longest transcript:  \"ENST00000003084\" \n",
      "Length:  188703\n"
     ]
    }
   ],
   "source": [
    "# Open the GTF file and loop throught it.\n",
    "# Find all the instances of transcripts for specific gene\n",
    "# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)\n",
    "# If longer than previous longest, store it and continue\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "\n",
    "# Find longest transcript\n",
    "longest_transcript = (0, '') # update the list with length and transcript ID with a for loop\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        entries = line.strip().split(\"\\t\")\n",
    "        if \"ENSG00000001626\" in line and entries[2] == \"transcript\":\n",
    "            attribute = entries[8].split(\"; \")\n",
    "            transcriptID = attribute[2].strip().split(' ')[1]\n",
    "            length = int(entries[4]) - int(entries[3]) + 1\n",
    "            if length > longest_transcript[0]:\n",
    "                longest_transcript = (length, transcriptID)\n",
    "                                \n",
    "print(\"Longest transcript: \", longest_transcript[1], \"\\nLength: \", longest_transcript[0])\n",
    "\n",
    "g.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Fetch the DNA sequence for that transcript."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"ENST00000003084\" 117479963 117668665\n",
      "AATTGGAAGC ...\n"
     ]
    }
   ],
   "source": [
    "# Get the longest transcript from the previous step \n",
    "# and extract the start and end position from the columns 3,4\n",
    "# Save the whole sequence from the fasta file to a variable\n",
    "# and get the start-1 to end from this variable \n",
    "# (incl. its start and end positions)\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        entries = line.strip().split(\"\\t\")\n",
    "        if \"ENSG00000001626\" in line and entries[2] == \"transcript\":\n",
    "            attribute = entries[8].split(\"; \")\n",
    "            transcriptID = attribute[2].strip().split(' ')[1]\n",
    "            if '\"ENST00000003084\"' == transcriptID:\n",
    "                start = int(entries[3])\n",
    "                end = int(entries[4])\n",
    "                print(transcriptID, start, end)\n",
    "\n",
    "g.close()\n",
    "\n",
    "# Extract the longest transcript sequence from the genome fasta file\n",
    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
    "\n",
    "# Store the DNA sequence of chromosome 7 in a variable\n",
    "seqList = []\n",
    "for line in f:\n",
    "    if not line.startswith(\">\"):\n",
    "        seqList.append(line.strip())\n",
    "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "f.close()\n",
    "\n",
    "# Store DNA sequence of longest transcript in variable\n",
    "cftr = chr7[start-1:end] \n",
    "print(cftr[:10], \"...\")\n",
    "\n",
    "d = open(\"CFTR_longest_transcript.txt\", 'w')\n",
    "d.write(cftr)\n",
    "d.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the sequence with the package utils.check_answers (note that I moved the directory \"utils\" to the directory where the Jupyter notebook is)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequences are matching!\n"
     ]
    }
   ],
   "source": [
    "from utils import check_answers\n",
    "check_answers.ex3(\"CFTR_longest_transcript.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Fetch the DNA sequences of all exons for that transcript, spliced together to one sequence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AATTGGAAGC ...\n"
     ]
    }
   ],
   "source": [
    "# Open the GTF file and loop throught it. Get the same as in the previous question\n",
    "# but store all of the transcripts in a set\n",
    "# Create a variable from the other file, like in the previous question\n",
    "# Loop through the set in the other file and pick the positions of the exons\n",
    "# Get the longest transcript and its exons\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "\n",
    "exons = []\n",
    "\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        entries = line.strip().split(\"\\t\")\n",
    "        if \"ENSG00000001626\" in line and entries[2] == \"exon\":\n",
    "            attribute = entries[8].split(\"; \")\n",
    "            transcriptID = attribute[2].strip().split(' ')[1]\n",
    "            if '\"ENST00000003084\"' == transcriptID:\n",
    "                start = int(entries[3])\n",
    "                end = int(entries[4])\n",
    "                exons.append((start, end))\n",
    "\n",
    "g.close()\n",
    "\n",
    "# Extract all exons of the longest transcript sequence from the genome fasta file\n",
    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
    "\n",
    "seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
    "for line in f:\n",
    "    if not line.startswith(\">\"):\n",
    "        seqList.append(line.strip())\n",
    "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "f.close()\n",
    "\n",
    "mRNA = \"\" # add exon sequences to string\n",
    "for exon in exons:\n",
    "    mRNA += chr7[exon[0]-1:exon[1]]\n",
    "\n",
    "print(mRNA[:10], \"...\")\n",
    "\n",
    "d = open(\"CFTR_longest_transcript_exons.txt\", 'w')\n",
    "d.write(mRNA)\n",
    "d.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the sequence with the package utils.check_answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequences are matching!\n"
     ]
    }
   ],
   "source": [
    "# no need to import again as it was imported in a cell above\n",
    "from utils import check_answers\n",
    "check_answers.ex4(\"CFTR_longest_transcript_exons.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. What are the start positions and sequences of the start_codon and stop_codon for that transcript?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The start codon is ATG and starts at position 117480095\n",
      "The stop codon is TAG and starts at position 117667106\n"
     ]
    }
   ],
   "source": [
    "# Get the longest transcript incl. its start and stop codon positions\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        entries = line.strip().split(\"\\t\")\n",
    "        if \"ENSG00000001626\" in line and entries[2] == 'start_codon' or entries[2] == 'stop_codon': # check for start or stop codons\n",
    "                attribute = entries[8].split(\"; \")\n",
    "                transcriptID = attribute[2].strip().split(' ')[1]\n",
    "                if '\"ENST00000003084\"' == transcriptID:\n",
    "                    if entries[2] == 'start_codon':\n",
    "                        start_codon = int(entries[3]) # save start codon\n",
    "                    elif entries[2] == 'stop_codon':\n",
    "                        stop_codon = int(entries[3]) # save stop codon\n",
    "\n",
    "g.close()\n",
    "\n",
    "# Extract all start and stop codon of the longest transcript sequence from the genome fasta file\n",
    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
    "\n",
    "seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
    "for line in f:\n",
    "    if not line.startswith(\">\"):\n",
    "        seqList.append(line.strip())\n",
    "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "f.close()\n",
    "\n",
    "if chr7[start_codon-1:start_codon+2] == \"ATG\":\n",
    "    print(\"The start codon is\", chr7[start_codon-1:start_codon+2], \"and starts at position\", start_codon)\n",
    "else:\n",
    "    print(\"The start codon is not ATG\")\n",
    "\n",
    "if chr7[stop_codon-1:stop_codon+2] == \"TAG\" or chr7[stop_codon-1:stop_codon+2] == \"TAA\" or chr7[stop_codon-1:stop_codon+2] == \"TGA\":\n",
    "    print(\"The stop codon is\", chr7[stop_codon-1:stop_codon+2], \"and starts at position\", stop_codon)\n",
    "else:\n",
    "    print(\"The stop codon does not correspond to a proper stop codon\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6. Translate the above sequence of all exons into amino acids, using an implementation of the translation table from the utils.rna package"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The first exon has the positions (117479963, 117480147)\n",
      "The start codon has the position 117480095\n",
      "MQRSPLEKAS ...\n"
     ]
    }
   ],
   "source": [
    "# Get the longest transcript and its exons, as well as the start codon\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "\n",
    "exons = []\n",
    "\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        entries = line.strip().split(\"\\t\")\n",
    "        if \"ENSG00000001626\" in line:\n",
    "            if entries[2] == \"exon\":\n",
    "                attribute = entries[8].split(\"; \")\n",
    "                transcriptID = attribute[2].strip().split(' ')[1]\n",
    "                if '\"ENST00000003084\"' == transcriptID:\n",
    "                    start = int(entries[3])\n",
    "                    end = int(entries[4])\n",
    "                    exons.append((start, end))\n",
    "            elif entries[2] == 'start_codon':\n",
    "                attribute = entries[8].split(\"; \")\n",
    "                transcriptID = attribute[2].strip().split(' ')[1]\n",
    "                if '\"ENST00000003084\"' == transcriptID:\n",
    "                    start_codon = int(entries[3])\n",
    "\n",
    "print(\"The first exon has the positions\", exons[0])\n",
    "print(\"The start codon has the position\", start_codon)\n",
    "g.close()\n",
    "\n",
    "# Extract all exons of the longest transcript sequence from the genome fasta file\n",
    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
    "\n",
    "seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
    "for line in f:\n",
    "    if not line.startswith(\">\"):\n",
    "        seqList.append(line.strip())\n",
    "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "f.close()\n",
    "\n",
    "mRNA = \"\" # add exon sequences to string\n",
    "for exon in exons:\n",
    "    mRNA += chr7[exon[0]-1:exon[1]]\n",
    "\n",
    "# Translate to aminoacids, from the correct start position on\n",
    "if chr7[start_codon-1:start_codon+2] != \"ATG\":\n",
    "    print(\"The start codon is not ATG\")\n",
    "else:\n",
    "    start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon\n",
    "\n",
    "from utils.rna import translate_dna\n",
    "aminoacids = translate_dna(mRNA[start:]) # translate the mRNA sequence from the start codon on\n",
    "print(aminoacids[:10], \"...\")\n",
    "\n",
    "a = open(\"CFTR_longest_transcript_aminoacids.txt\", 'w')\n",
    "a.write(aminoacids)\n",
    "a.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the sequence with the package utils.check_answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequences are matching!\n"
     ]
    }
   ],
   "source": [
    "check_answers.ex6(\"CFTR_longest_transcript_aminoacids.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use the python code you have written to solve the tasks above to determine which of five patients are carrying a mutation in the CFTR gene that causes a truncated protein."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequence of patient  3  has a different length than the reference genome sequence.\n"
     ]
    }
   ],
   "source": [
    "# Get the longest transcript and its exons, as well as the start codon\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "\n",
    "exons = []\n",
    "\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        entries = line.strip().split(\"\\t\")\n",
    "        if \"ENSG00000001626\" in line:\n",
    "            if entries[2] == \"exon\":\n",
    "                attribute = entries[8].split(\"; \")\n",
    "                transcriptID = attribute[2].strip().split(' ')[1]\n",
    "                if '\"ENST00000003084\"' == transcriptID:\n",
    "                    start = int(entries[3])\n",
    "                    end = int(entries[4])\n",
    "                    exons.append((start, end))\n",
    "            elif entries[2] == 'start_codon':\n",
    "                attribute = entries[8].split(\"; \")\n",
    "                transcriptID = attribute[2].strip().split(' ')[1]\n",
    "                if '\"ENST00000003084\"' == transcriptID:\n",
    "                    start_codon = int(entries[3])\n",
    "\n",
    "g.close()\n",
    "\n",
    "# Extract all exons of the longest transcript sequence from the genome fasta file\n",
    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
    "\n",
    "seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
    "for line in f:\n",
    "    if not line.startswith(\">\"):\n",
    "        seqList.append(line.strip())\n",
    "chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "f.close()\n",
    "\n",
    "mRNA = \"\" # add exon sequences to string\n",
    "for exon in exons:\n",
    "    mRNA += chr7[exon[0]-1:exon[1]]\n",
    "\n",
    "# Translate to aminoacids, from the correct start position on\n",
    "if chr7[start_codon-1:start_codon+2] != \"ATG\":\n",
    "    print(\"The start codon is not ATG\")\n",
    "else:\n",
    "    start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon\n",
    "\n",
    "from utils.rna import translate_dna\n",
    "aminoacids = translate_dna(mRNA[start:]) # translate the mRNA sequence from the start codon on\n",
    "\n",
    "i = 1 # index for loop through patient fasta files\n",
    "while i < 6:\n",
    "    p = open(\"Patient\" + str(i) + \".fa\", 'r')\n",
    "    patientSeqList = [] # store the DNA sequence of the patient in a list\n",
    "    for line in p:\n",
    "        if not line.startswith(\">\"):\n",
    "            patientSeqList.append(line.strip())\n",
    "    patientSeq = \"\".join(patientSeqList) # construct one sequence for the currently analyzed patient\n",
    "\n",
    "    patientmRNA = \"\" # append exon sequences to string. Works only for small strings.\n",
    "    for exon in exons:\n",
    "        patientmRNA += patientSeq[exon[0]-1:exon[1]]\n",
    "\n",
    "    # Translate to aminoacids\n",
    "    patient_aminoacids = translate_dna(patientmRNA[start:]) # translate the mRNA sequence from the start codon on\n",
    "    if not len(patient_aminoacids) == len(aminoacids):\n",
    "        print(\"The sequence of patient \", str(i), \" has a different length than the reference genome sequence.\")\n",
    "\n",
    "    p.close()\n",
    "    i += 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extra task: Use BioPython to parse the fasta file and to translate DNA nucleotides into amino acids."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequence of patient  3  has a different length than the reference genome sequence.\n"
     ]
    }
   ],
   "source": [
    "from Bio import SeqIO\n",
    "from Bio.Seq import Seq\n",
    "from Bio.Alphabet import IUPAC\n",
    "\n",
    "# Get the longest transcript and its exons, as well as the start codon\n",
    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
    "\n",
    "exons = []\n",
    "\n",
    "for line in g:\n",
    "    if not line.startswith(\"#\"):\n",
    "        entries = line.strip().split(\"\\t\")\n",
    "        if \"ENSG00000001626\" in line:\n",
    "            if entries[2] == \"exon\":\n",
    "                attribute = entries[8].split(\"; \")\n",
    "                transcriptID = attribute[2].strip().split(' ')[1]\n",
    "                if '\"ENST00000003084\"' == transcriptID:\n",
    "                    start = int(entries[3])\n",
    "                    end = int(entries[4])\n",
    "                    exons.append((start, end))\n",
    "            elif entries[2] == 'start_codon':\n",
    "                attribute = entries[8].split(\"; \")\n",
    "                transcriptID = attribute[2].strip().split(' ')[1]\n",
    "                if '\"ENST00000003084\"' == transcriptID:\n",
    "                    start_codon = int(entries[3])\n",
    "\n",
    "g.close()\n",
    "\n",
    "\n",
    "# Extract all exons of the longest transcript sequence from the genome fasta file\n",
    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
    "\n",
    "records = list(SeqIO.parse(f, \"fasta\")) # BioPython\n",
    "if records[0].id == \"7\":  # BioPython\n",
    "    chr7 = records[0].seq # BioPython\n",
    "\n",
    "f.close()\n",
    "\n",
    "\n",
    "mRNA = \"\" # add exon sequences to string\n",
    "for exon in exons:\n",
    "    mRNA += chr7[exon[0]-1:exon[1]]\n",
    "\n",
    "# Translate to aminoacids, from the correct start position on\n",
    "if chr7[start_codon-1:start_codon+2] != \"ATG\":\n",
    "    print(\"The start codon is not ATG\")\n",
    "else:\n",
    "    start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon\n",
    "\n",
    "aminoacids = mRNA[start:].translate(to_stop=True) # BioPython\n",
    "\n",
    "i = 1 # index for loop through patient fasta files\n",
    "while i < 6:\n",
    "    p = \"Patient\" + str(i) + \".fa\"\n",
    "    patientRecords = list(SeqIO.parse(p, \"fasta\")) # BioPython\n",
    "    if patientRecords[0].id == \"7\":  # BioPython\n",
    "        patientSeq = patientRecords[0].seq # BioPython\n",
    "\n",
    "    patientmRNA = \"\" # append exon sequences to string. Works only for small strings.\n",
    "    for exon in exons:\n",
    "        patientmRNA += patientSeq[exon[0]-1:exon[1]]\n",
    "\n",
    "    # Translate to aminoacids\n",
    "    patient_aminoacids = patientmRNA[start:].translate(to_stop=True) # BioPython\n",
    "    \n",
    "    if not len(patient_aminoacids) == len(aminoacids):\n",
    "        print(\"The sequence of patient \", str(i), \" has a different length than the reference genome sequence.\")\n",
    "\n",
    "    i += 1"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}