{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Warmup tasks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the data.zip file from http://pcons1.scilifelab.se/misc/download/python2024/data.zip and unzip it. Then rename the folder as `data`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. What is the length of chromosome 7 on the reference sequence?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "159345973\n"
     ]
    }
   ],
   "source": [
    "# Open the reference file (fasta) and loop through it, removing the first line (starts with >)\n",
    "# Sum the length of the lines, removing the newline character\n",
    "with open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r') as fh:\n",
    "    length = 0\n",
    "    for line in fh:\n",
    "        if not line.startswith(\">\"):\n",
    "            length += len(line.strip()) # add the length of the line, without newline character\n",
    "    print(length)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. How many genes are annotated in the GTF file?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "58395\n"
     ]
    }
   ],
   "source": [
    "# Open the GTF file and loop through it.\n",
    "# To find the genes, check the 3rd element of each line and add 1 if the feature is \"gene\"\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    num_genes = 0\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            feature = line.split(\"\\t\")[2]\n",
    "            if feature == \"gene\":\n",
    "                num_genes +=1\n",
    "    print(num_genes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Architect a method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'gene_id': 'unkown gene', 'gene_version': '14', 'transcript_id': 'ENST00000546407', 'transcript_version': '1', 'gene_name': 'CFTR', 'gene_source': 'ensembl_havana', 'gene_biotype': 'protein_coding', 'transcript_name': 'CFTR-207', 'transcript_source': 'havana', 'transcript_biotype': 'processed_transcript', 'transcript_support_level': '1'}\n",
      "gene_id: unkown gene\n",
      "transcript_id: ENST00000546407\n"
     ]
    }
   ],
   "source": [
    "# helper function to parse attribute\n",
    "# A more robust way to parse the attribute by using a dictionary. \n",
    "# In this case, attribute items will be parsed correctly even when the order of the items are changed.\n",
    "def parse_attribute(attribute):\n",
    "    \"\"\"\n",
    "    Parse the attribute field from the GTF file\n",
    "    Input: string of the attribute field \n",
    "    Output: a dictionary\n",
    "    \"\"\"\n",
    "    li_attr = [x.strip() for x in attribute.split(';') if x.strip()]\n",
    "    d_attr = {}\n",
    "    for attr in li_attr:\n",
    "        key_and_value = attr.split(' ', maxsplit=1)\n",
    "        if (len(key_and_value) < 2):\n",
    "            print(f\"Error: bad data {attr} in attribute {attribute}\")\n",
    "        key = key_and_value[0]\n",
    "        value = key_and_value[1].strip().strip('\"')\n",
    "        d_attr[key] = value\n",
    "    return d_attr\n",
    "\n",
    "# Example usage of the function parse_att\n",
    "attribute = 'gene_id \"unkown gene\"; gene_version \"14\"; transcript_id \"ENST00000546407\"; transcript_version \"1\"; gene_name \"CFTR\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"CFTR-207\"; transcript_source \"havana\"; transcript_biotype \"processed_transcript\"; transcript_support_level \"1\";'\n",
    "d_attr = parse_attribute(attribute)\n",
    "print(d_attr)\n",
    "gene_id = d_attr.get(\"gene_id\", \"\")\n",
    "print(f\"gene_id: {gene_id}\")\n",
    "\n",
    "transcript_id = d_attr.get(\"transcript_id\", \"\")\n",
    "print(f\"transcript_id: {transcript_id}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All following tasks are related to the CFTR gene."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. How many transcripts can the CFTR gene generate?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of transcripts:  11\n"
     ]
    }
   ],
   "source": [
    "# Open the GTF file and loop through it.\n",
    "# To find the number of transcripts, check the 3rd element for \"transcript\" while the specific gene exists\n",
    "# Can also be solved with bash using: \n",
    "# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep \"\\ttranscript\\t\" | wc -l\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    # Find number of transcripts\n",
    "    num_transcripts = 0\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            entries = line.split(\"\\t\")\n",
    "            feature = entries[2]\n",
    "            attribute = entries[8]\n",
    "            if feature == 'transcript':\n",
    "                li_attr = attribute.split(\";\")\n",
    "                gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('\"') # remember to strip the surrounding quote\n",
    "                if gene_id == \"ENSG00000001626\":\n",
    "                    num_transcripts += 1\n",
    "                    \n",
    "    print(\"Number of transcripts: \", num_transcripts)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Which of these transcripts is the longest transcript in nucleotides?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Longest transcript: ENST00000003084\n",
      "Length: 188703\n"
     ]
    }
   ],
   "source": [
    "# Open the GTF file and loop throught it.\n",
    "# Find all the instances of transcripts for specific gene\n",
    "# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)\n",
    "# If longer than previous longest, store it and continue\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    # Find longest transcript\n",
    "    longest_transcript = (0, '') # update the list with length and transcript ID with a for loop\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            entries = line.strip().split(\"\\t\")\n",
    "            feature = entries[2]\n",
    "            attribute = entries[8]\n",
    "            if feature == \"transcript\":\n",
    "                li_attr = attribute.split(\";\")\n",
    "                gene_id = li_attr[0].strip().split()[1].strip('\"')\n",
    "                if gene_id == \"ENSG00000001626\":\n",
    "                    transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "                    length = int(entries[4]) - int(entries[3]) + 1\n",
    "                    #print(line)\n",
    "                    if length > longest_transcript[0]:\n",
    "                        longest_transcript = (length, transcript_id)\n",
    "\n",
    "    print(f\"Longest transcript: {longest_transcript[1]}\")\n",
    "    print(f\"Length: {longest_transcript[0]}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Fetch the DNA sequence for that transcript."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ENST00000003084 117479963 117668665\n",
      "First 10 nucleotides of the longest transcript: AATTGGAAGC\n",
      "The DNA sequence of the longest transcript has been output to the file 'CFTR_longest_transcript.txt'\n"
     ]
    }
   ],
   "source": [
    "# Get the longest transcript from the previous step \n",
    "# and extract the start and end position from the columns 4 and 5\n",
    "# Save the whole sequence from the fasta file to a variable\n",
    "# and get the start-1 to end from this variable \n",
    "# (including its start and end positions)\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            entries = line.strip().split(\"\\t\")\n",
    "            feature = entries[2]\n",
    "            attribute = entries[8]\n",
    "            if feature == \"transcript\":\n",
    "                li_attr = attribute.split(\";\")\n",
    "                gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "                if gene_id == \"ENSG00000001626\":\n",
    "                    transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "                    if  transcript_id == \"ENST00000003084\":\n",
    "                        start = int(entries[3])\n",
    "                        end = int(entries[4])\n",
    "                        print(transcript_id, start, end)\n",
    "\n",
    "\n",
    "# Get the full DNA sequence of chromosome 7\n",
    "with open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r') as fh:\n",
    "    # Store the DNA sequence of chromosome 7 in a list when reading lines\n",
    "    seqList = []\n",
    "    for line in fh:\n",
    "        if not line.startswith(\">\"):\n",
    "            seqList.append(line.strip())\n",
    "    seq_chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "# Store DNA sequence of the longest transcript in a variable\n",
    "seq_CFTR = seq_chr7[start-1:end] \n",
    "print(f\"First 10 nucleotides of the longest transcript: {seq_CFTR[:10]}\")\n",
    "\n",
    "dna_seqfile = \"CFTR_longest_transcript.txt\"\n",
    "with open(dna_seqfile, 'w') as fh:\n",
    "    fh.write(seq_CFTR)\n",
    "    print(f\"The DNA sequence of the longest transcript has been output to the file '{dna_seqfile}'\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the sequence with the package utils.check_answers (note that I moved the directory \"utils\" to the directory where the Jupyter notebook is)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequences are matching!\n"
     ]
    }
   ],
   "source": [
    "from utils import check_answers\n",
    "check_answers.ex3(\"CFTR_longest_transcript.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Fetch the DNA sequences of all exons for that transcript, concatenate into  one sequence."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The first 10 nucleotides for the coding sequence: AATTGGAAGC\n",
      "The coding sequence has been output to the file 'CFTR_longest_transcript_coding_seq.txt'\n"
     ]
    }
   ],
   "source": [
    "# Open the GTF file and loop through it. Get the same as in the previous question\n",
    "# but store all of the transcripts in a set\n",
    "# Create a variable from the other file, like in the previous question\n",
    "# Loop through the set in the other file and pick the positions of the exons\n",
    "# Get the longest transcript and its exons\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    exons = []\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            entries = line.strip().split(\"\\t\")\n",
    "            feature = entries[2]\n",
    "            attribute = entries[8]\n",
    "            if feature == \"exon\":\n",
    "                li_attr = attribute.split(\";\")\n",
    "                gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "                transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "                if gene_id == \"ENSG00000001626\" and transcript_id == \"ENST00000003084\":\n",
    "                    start = int(entries[3])\n",
    "                    end = int(entries[4])\n",
    "                    exons.append((start, end))\n",
    "\n",
    "\n",
    "# Get the full DNA sequence of chromosome 7\n",
    "with open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r') as fh:\n",
    "    seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines\n",
    "    for line in fh:\n",
    "        if not line.startswith(\">\"):\n",
    "            seqList.append(line.strip())\n",
    "    seq_chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "# Get the coding sequence by joining all exon sequences\n",
    "coding_seq = \"\" # add exon sequences to string\n",
    "for exon in exons:\n",
    "    coding_seq += seq_chr7[exon[0]-1 : exon[1]]\n",
    "\n",
    "print(f\"The first 10 nucleotides for the coding sequence: {coding_seq[:10]}\")\n",
    "\n",
    "coding_seqfile = \"CFTR_longest_transcript_coding_seq.txt\"\n",
    "with open(coding_seqfile, 'w') as fh:\n",
    "    fh.write(coding_seq)\n",
    "    print(f\"The coding sequence has been output to the file '{coding_seqfile}'\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the sequence with the package utils.check_answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequences are matching!\n"
     ]
    }
   ],
   "source": [
    "# no need to import again as it was imported in a cell above\n",
    "from utils import check_answers\n",
    "check_answers.ex4(\"CFTR_longest_transcript_coding_seq.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. What are the start positions and sequences of the start_codon and stop_codon for that transcript?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The start codon is ATG and starts at position 117480095\n",
      "The stop codon is TAG and starts at position 117667106\n"
     ]
    }
   ],
   "source": [
    "# Get the longest transcript including its start and stop codon positions\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            entries = line.strip().split(\"\\t\")\n",
    "            feature = entries[2]\n",
    "            attribute = entries[8]\n",
    "            if feature in ['start_codon', 'stop_codon']: # check for start or stop codons\n",
    "                li_attr = attribute.split(\";\")\n",
    "                gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "                transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "                if gene_id == \"ENSG00000001626\" and transcript_id == \"ENST00000003084\":\n",
    "                    if feature == 'start_codon':\n",
    "                        pos_start_codon = int(entries[3]) # save position for the start codon \n",
    "                    elif feature == 'stop_codon':\n",
    "                        pos_stop_codon = int(entries[3]) # save position for the stop codon\n",
    "\n",
    "\n",
    "\n",
    "# Get the full DNA sequence of chromosome 7\n",
    "with open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r') as fh:\n",
    "    seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines\n",
    "    for line in fh:\n",
    "        if not line.startswith(\">\"):\n",
    "            seqList.append(line.strip())\n",
    "    seq_chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "\n",
    "if seq_chr7[pos_start_codon-1:pos_start_codon+2] == \"ATG\":\n",
    "    print(\"The start codon is\", chr7[pos_start_codon-1:pos_start_codon+2], \"and starts at position\", pos_start_codon)\n",
    "else:\n",
    "    print(\"Warning! The start codon is not ATG\")\n",
    "\n",
    "if seq_chr7[pos_stop_codon-1:pos_stop_codon+2] in [\"TAG\", \"TAA\", \"TGA\"]:\n",
    "    print(\"The stop codon is\", chr7[pos_stop_codon-1:pos_stop_codon+2], \"and starts at position\", pos_stop_codon)\n",
    "else:\n",
    "    print(\"Warning! The stop codon does not correspond to a proper stop codon\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6. Translate the above sequence of all exons into amino acids, using an implementation of the translation table from the utils.rna package\n",
    "\n",
    "strand is all positive for this transcript. For more robust code, you also need to think about the translation direction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of exons for this transcript:  27\n",
      "The first exon has the positions (117479963, 117480147)\n",
      "The start codon has the position 117480095\n",
      "First 10 amino acids of the protein CFTR: MQRSPLEKAS\n"
     ]
    }
   ],
   "source": [
    "# Get the longest transcript and its exons, as well as the the position of the start codon\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    exons = []\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            entries = line.strip().split(\"\\t\")\n",
    "            strand = entries[6]\n",
    "            feature = entries[2]\n",
    "            attribute = entries[8]\n",
    "            li_attr = attribute.split(\";\")\n",
    "            gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "            transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('\"')\n",
    "            \n",
    "            if gene_id == \"ENSG00000001626\" and transcript_id == \"ENST00000003084\":\n",
    "                # print('strand=',strand)\n",
    "                if feature == \"exon\":\n",
    "                    start = int(entries[3])\n",
    "                    end = int(entries[4])\n",
    "                    exons.append((start, end))\n",
    "                elif feature == 'start_codon':\n",
    "                    pos_start_codon = int(entries[3])\n",
    "                    \n",
    "print(\"Number of exons for this transcript: \", len(exons))\n",
    "print(\"The first exon has the positions\", exons[0])\n",
    "print(\"The start codon has the position\", pos_start_codon)\n",
    "\n",
    "\n",
    "# Get the full DNA sequence of chromosome 7\n",
    "with open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r') as fh:\n",
    "    seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines\n",
    "    for line in fh:\n",
    "        if not line.startswith(\">\"):\n",
    "            seqList.append(line.strip())\n",
    "    seq_chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "# Get the coding sequence by joining all exon sequences\n",
    "coding_seq = \"\" # add exon sequences to string\n",
    "for exon in exons:\n",
    "    coding_seq += seq_chr7[exon[0]-1:exon[1]]\n",
    "\n",
    "# Translate to aminoacids, from the correct start position on\n",
    "if seq_chr7[pos_start_codon-1:pos_start_codon+2] != \"ATG\":\n",
    "    print(\"The start codon is not ATG\")\n",
    "else:\n",
    "    start = pos_start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start\n",
    "                                      # codon position and the start position of the first exon\n",
    "\n",
    "from utils.rna import translate_dna\n",
    "protein_CFTR = translate_dna(coding_seq[start:]) # translate the mRNA sequence from the start codon on\n",
    "print(f\"First 10 amino acids of the protein CFTR: {protein_CFTR[:10]}\")\n",
    "\n",
    "with open(\"CFTR_longest_transcript_aminoacids.txt\", 'w') as fh:\n",
    "    fh.write(protein_CFTR)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the sequence with the package utils.check_answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequences are matching!\n"
     ]
    }
   ],
   "source": [
    "check_answers.ex6(\"CFTR_longest_transcript_aminoacids.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use the python code you have written to solve the tasks above to determine which of five patients are carrying a mutation in the CFTR gene that causes a truncated protein."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequence of patient 1 has a different length than the reference genome sequence.\n",
      "patient gene length :113\n",
      "reference gene length: 1480\n"
     ]
    }
   ],
   "source": [
    "# Get the longest transcript and its exons, as well as the start codon\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    exons = []\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            entries = line.strip().split(\"\\t\")\n",
    "            strand = entries[6]\n",
    "            feature = entries[2]\n",
    "            attribute = entries[8]\n",
    "            li_attr = attribute.split(\";\")\n",
    "            gene_id = li_attr[0].strip().split()[1].strip('\"')\n",
    "            transcript_id = li_attr[2].strip().split()[1].strip('\"')\n",
    "            \n",
    "            if gene_id == \"ENSG00000001626\" and transcript_id == \"ENST00000003084\":\n",
    "                # print('strand=',strand)\n",
    "                if feature == \"exon\":\n",
    "                    start = int(entries[3])\n",
    "                    end = int(entries[4])\n",
    "                    exons.append((start, end))\n",
    "                elif feature == 'start_codon':\n",
    "                    pos_start_codon = int(entries[3])\n",
    "\n",
    "\n",
    "# Get the full DNA sequence of chromosome 7\n",
    "with open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r') as fh:\n",
    "    seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
    "    for line in fh:\n",
    "        if not line.startswith(\">\"):\n",
    "            seqList.append(line.strip())\n",
    "    seq_chr7 = \"\".join(seqList) # join all DNA fragments from the seqList to one string\n",
    "\n",
    "\n",
    "# Get the coding sequence by joining all exon sequences\n",
    "coding_seq = \"\" # add exon sequences to string\n",
    "for exon in exons:\n",
    "    coding_seq += seq_chr7[exon[0]-1:exon[1]]\n",
    "\n",
    "# Translate to aminoacids, from the correct start position on\n",
    "if seq_chr7[pos_start_codon-1:pos_start_codon+2] != \"ATG\":\n",
    "    print(\"The start codon is not ATG\")\n",
    "else:\n",
    "    start = pos_start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon\n",
    "\n",
    "from utils.rna import translate_dna\n",
    "protein_CFTR = translate_dna(coding_seq[start:]) # translate the mRNA sequence from the start codon on\n",
    "\n",
    "i = 1 # counter for looping through patient fasta files\n",
    "while i < 6:\n",
    "    patient_seqfile = f\"data/Patient{i}.fa\"\n",
    "    with open(patient_seqfile, \"r\") as pf:\n",
    "        patientSeqList = [] # store the DNA sequence of the patient in a list\n",
    "        for line in pf:\n",
    "            if not line.startswith(\">\"):\n",
    "                patientSeqList.append(line.strip())\n",
    "        patientSeq = \"\".join(patientSeqList) # construct one sequence for the currently analyzed patient\n",
    "\n",
    "        patient_coding_seq = \"\" # append exon sequences to string. Works only for small strings.\n",
    "        for exon in exons:\n",
    "            patient_coding_seq += patientSeq[exon[0]-1:exon[1]]\n",
    "\n",
    "        # Translate to amino acids\n",
    "        patient_protein_CFTR = translate_dna(patient_coding_seq[start:]) # translate the coding sequence from the start codon on\n",
    "        if not len(patient_protein_CFTR) == len(protein_CFTR):\n",
    "            print(f\"The sequence of patient {i} has a different length than the reference genome sequence.\")\n",
    "            print(f\"patient gene length :{len(patient_protein_CFTR)}\")\n",
    "            print(f\"reference gene length: {len(protein_CFTR)}\")\n",
    "            \n",
    "    i += 1 # remember to increase the counter by 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extra task: Use BioPython to parse the fasta file and to translate DNA nucleotides into amino acids."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequence of patient 1 has a different length than the reference genome sequence.\n",
      "patient gene length :113\n",
      "reference gene length: 1480\n"
     ]
    }
   ],
   "source": [
    "from Bio import SeqIO\n",
    "from Bio.Seq import Seq\n",
    "\n",
    "# Get the longest transcript and its exons, as well as the start codon\n",
    "# In this example, we use the function parse_attribute to parse the field\n",
    "with open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r') as fh:\n",
    "    exons = []\n",
    "    for line in fh:\n",
    "        if not line.startswith(\"#\"):\n",
    "            entries = line.strip().split(\"\\t\")\n",
    "            strand = entries[6]\n",
    "            feature = entries[2]\n",
    "            attribute = entries[8]\n",
    "            d_attr = parse_attribute(attribute)\n",
    "            \n",
    "            gene_id = d_attr.get(\"gene_id\", \"\")\n",
    "            transcript_id = d_attr.get(\"transcript_id\", \"\")\n",
    "                \n",
    "            if gene_id == \"ENSG00000001626\" and transcript_id == \"ENST00000003084\":\n",
    "                # print('strand=',strand)\n",
    "                if feature == \"exon\":\n",
    "                    start = int(entries[3])\n",
    "                    end = int(entries[4])\n",
    "                    exons.append((start, end))\n",
    "                elif feature == 'start_codon':\n",
    "                    pos_start_codon = int(entries[3])\n",
    "\n",
    "# Get the full DNA sequence of chromosome 7\n",
    "with open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r') as fh:\n",
    "    for record in SeqIO.parse(fh, \"fasta\"): # BioPython\n",
    "        if record.id == \"7\":  # BioPython\n",
    "            seq_chr7 = record.seq # BioPython\n",
    "\n",
    "\n",
    "coding_seq = Seq(\"\") # add exon sequences to string\n",
    "for exon in exons:\n",
    "    coding_seq += seq_chr7[exon[0]-1:exon[1]]\n",
    "\n",
    "# Translate to amino acids, from the correct start position on\n",
    "if seq_chr7[pos_start_codon-1:pos_start_codon+2] != \"ATG\":\n",
    "    print(\"Warning! The start codon is not ATG\")\n",
    "else:\n",
    "    start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon\n",
    "\n",
    "# Use the method Seq.translate() to translate the coding sequence to amino acids\n",
    "protein_CFTR = coding_seq[start:].translate(to_stop=True)\n",
    "\n",
    "i = 1 # counter for looping through patient fasta files\n",
    "while i < 6:\n",
    "    patient_seqfile = f\"data/Patient{i}.fa\"\n",
    "    \n",
    "    for record in SeqIO.parse(patient_seqfile, \"fasta\"): # BioPython\n",
    "        if record.id == \"7\":  # BioPython\n",
    "            patient_seq = record.seq # BioPython\n",
    "\n",
    "    patient_coding_seq = \"\" # append exon sequences to string. Works only for small strings.\n",
    "    for exon in exons:\n",
    "        patient_coding_seq += patient_seq[exon[0]-1:exon[1]]\n",
    "\n",
    "    # Translate to aminoacids\n",
    "    patient_protein_CFTR = patient_coding_seq[start:].translate(to_stop=True) # BioPython\n",
    "    \n",
    "    if not len(patient_protein_CFTR) == len(protein_CFTR):\n",
    "        print(f\"The sequence of patient {i} has a different length than the reference genome sequence.\")\n",
    "        print(f\"patient gene length :{len(patient_protein_CFTR)}\")\n",
    "        print(f\"reference gene length: {len(protein_CFTR)}\")\n",
    "        \n",
    "    i += 1 # remember to increase the counter by 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}