{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gffutils\n", "import gzip\n", "from Bio import Alphabet, Seq, SeqIO" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Retrieving data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2015-06-26 14:46:59-- ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz\n", " => ‘gambiae.fa.gz’\n", "Resolving ftp.vectorbase.org (ftp.vectorbase.org)... 129.74.255.228\n", "Connecting to ftp.vectorbase.org (ftp.vectorbase.org)|129.74.255.228|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. ==> CWD (1) /public_data/organism_data/agambiae/Genome ... done.\n", "==> SIZE agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... 81591806\n", "==> PASV ... done. ==> RETR agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... done.\n", "Length: 81591806 (78M) (unauthoritative)\n", "\n", "HROMOSOMES-PEST.Aga 67%[=============> ] 52.57M 3.08MB/s eta 12s " ] } ], "source": [ "!rm -rf gambiae.gff.gz ag.db 2>/dev/null\n", "!wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz\n", "!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!rm -f ag.db\n", "\n", "db = gffutils.create_db('gambiae.gff.gz', 'ag.db')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Getting a gene" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "gene_id = 'AGAP004707'" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "gene = db[gene_id]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2L\tVectorBase\tgene\t2358158\t2431617\t.\t+\t.\tID=AGAP004707;biotype=protein_coding\n", "('2L', '+')\n" ] } ], "source": [ "print(gene)\n", "print(gene.seqid, gene.strand)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "chromosome:AgamP3:2L:1:49364325:1 chromosome 2L\n", "SingleLetterAlphabet()\n" ] } ], "source": [ "recs = SeqIO.parse(gzip.open('gambiae.fa.gz'), 'fasta')\n", "for rec in recs:\n", " print(rec.description)\n", " if rec.description.split(':')[2] == gene.seqid:\n", " my_seq = rec.seq\n", " break\n", "print(my_seq.alphabet)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def get_sequence(chrom_seq, CDSs, strand):\n", " seq = Seq.Seq('', alphabet=Alphabet.IUPAC.unambiguous_dna)\n", " for CDS in CDSs:\n", " #FRAME???\n", " my_cds = Seq.Seq(str(my_seq[CDS.start - 1: CDS.end]), alphabet=Alphabet.IUPAC.unambiguous_dna)\n", " seq += my_cds\n", " return seq if strand == '+' else seq.reverse_complement()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AGAP004707-RA\n", "(6357, Seq('ATGACCGAAGACTCCGATTCGATATCTGAGGAAGAACGTAGTTTGTTCCGTCCT...TGA', IUPACUnambiguousDNA()))\n", "(2119, Seq('MTEDSDSISEEERSLFRPFTRESLQAIEARIADEEAKQRELERKRAEGEDEDEG...DV*', HasStopCodon(IUPACProtein(), '*')))\n" ] } ], "source": [ "mRNAs = db.children(gene, featuretype='mRNA')\n", "for mRNA in mRNAs:\n", " print(mRNA.id)\n", " if mRNA.id.endswith('RA'):\n", " break\n", "\n", "CDSs = db.children(mRNA, featuretype='CDS', order_by='start')\n", "gene_seq = get_sequence(my_seq, CDSs, gene.strand)\n", "\n", "print(len(gene_seq), gene_seq)\n", "prot = gene_seq.translate()\n", "print(len(prot), prot)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reverse strand" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "reverse_transcript_id = 'AGAP004708-RA'" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1992, Seq('ATGGCTGACTTCGATAGTGCCACTAAATGTATCAGAAACATTGAAAAAGAAATT...TAA', IUPACUnambiguousDNA()))\n", "(664, Seq('MADFDSATKCIRNIEKEILLLQSEVLKTREGLGLEDDNVELKKLMEENTRLKHR...KI*', HasStopCodon(IUPACProtein(), '*')))\n" ] } ], "source": [ "reverse_CDSs = db.children(reverse_transcript_id, featuretype='CDS', order_by='start')\n", "reverse_seq = get_sequence(my_seq, reverse_CDSs, '-')\n", "\n", "print(len(reverse_seq), reverse_seq)\n", "reverse_prot = reverse_seq.translate()\n", "print(len(reverse_prot), reverse_prot)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }