{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2015-02-18 14:00:45-- http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz\n", "Resolving www.vectorbase.org (www.vectorbase.org)... 129.74.255.228\n", "Connecting to www.vectorbase.org (www.vectorbase.org)|129.74.255.228|:80... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz [following]\n", "--2015-02-18 14:00:45-- https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz\n", "Connecting to www.vectorbase.org (www.vectorbase.org)|129.74.255.228|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz [following]\n", "--2015-02-18 14:00:47-- https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz\n", "Reusing existing connection to www.vectorbase.org:443.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 2702601 (2.6M) [application/x-gzip]\n", "Saving to: ‘gambiae.gff.gz’\n", "\n", "100%[======================================>] 2,702,601 1.46MB/s in 1.8s \n", "\n", "2015-02-18 14:00:49 (1.46 MB/s) - ‘gambiae.gff.gz’ saved [2702601/2702601]\n", "\n" ] } ], "source": [ "!rm -rf gambiae.g?f.gz ag.db 2>/dev/null\n", "#!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gtfgz -O gambiae.gtf.gz\n", "!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gffutils\n", "import sqlite3" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!rm -f ag.db" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "try:\n", " db = gffutils.create_db('gambiae.gff.gz', 'ag.db')\n", "except sqlite3.OperationalError:\n", " db = gffutils.FeatureDB('ag.db')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['CDS', 'RNase_P_RNA', 'SRP_RNA', 'contig', 'exon', 'five_prime_UTR', 'gene', 'mRNA', 'miRNA', 'misc_RNA', 'pseudogene', 'rRNA', 'snRNA', 'snoRNA', 'tRNA', 'tRNA_pseudogene', 'three_prime_UTR']\n", "('CDS', 62408)\n", "('RNase_P_RNA', 1)\n", "('SRP_RNA', 3)\n", "('contig', 8)\n", "('exon', 66485)\n", "('five_prime_UTR', 10520)\n", "('gene', 13624)\n", "('mRNA', 14697)\n", "('miRNA', 187)\n", "('misc_RNA', 10)\n", "('pseudogene', 5)\n", "('rRNA', 53)\n", "('snRNA', 38)\n", "('snoRNA', 12)\n", "('tRNA', 463)\n", "('tRNA_pseudogene', 9)\n", "('three_prime_UTR', 7281)\n" ] } ], "source": [ "print(list(db.featuretypes()))\n", "for feat_type in db.featuretypes():\n", " print(feat_type, db.count_features_of_type(feat_type))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2L\tVectorBase\tcontig\t1\t49364325\t.\t.\t.\ttranslation_table=1;topology=linear;ID=2L;molecule_type=dsDNA;localization=chromosomal\n", "3R\tVectorBase\tcontig\t1\t53200684\t.\t.\t.\ttranslation_table=1;topology=linear;ID=3R;molecule_type=dsDNA;localization=chromosomal\n", "UNKN\tVectorBase\tcontig\t1\t42389979\t.\t.\t.\ttranslation_table=1;topology=linear;ID=UNKN;molecule_type=dsDNA;localization=chromosomal\n", "X\tVectorBase\tcontig\t1\t24393108\t.\t.\t.\ttranslation_table=1;topology=linear;ID=X;molecule_type=dsDNA;localization=chromosomal\n", "Y_unplaced\tVectorBase\tcontig\t1\t237045\t.\t.\t.\ttranslation_table=1;topology=linear;ID=Y_unplaced;molecule_type=dsDNA;localization=chromosomal\n", "Mt\tVectorBase\tcontig\t1\t15363\t.\t.\t.\ttranslation_table=1;topology=linear;ID=Mt;molecule_type=dsDNA;localization=chromosomal\n", "2R\tVectorBase\tcontig\t1\t61545105\t.\t.\t.\ttranslation_table=1;topology=linear;ID=2R;molecule_type=dsDNA;localization=chromosomal\n", "3L\tVectorBase\tcontig\t1\t41963435\t.\t.\t.\ttranslation_table=1;topology=linear;ID=3L;molecule_type=dsDNA;localization=chromosomal\n" ] } ], "source": [ "for contig in db.features_of_type('contig'):\n", " print(contig)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "contig 2L, number of genes 3105\n", "contig 3R, number of genes 2763\n", "contig UNKN, number of genes 567\n", "contig X, number of genes 1097\n", "contig Y_unplaced, number of genes 0\n", "contig Mt, number of genes 37\n", "contig 2R, number of genes 3834\n", "contig 3L, number of genes 2221\n", "Max number of exons: AGAP001660 (67)\n", "Max span: AGAP006656 (365621)\n", "defaultdict(, {0: 781, 1: 11595, 2: 910, 3: 211, 4: 74, 5: 27, 6: 9, 7: 5, 8: 4, 9: 1, 10: 1, 11: 3, 12: 1, 13: 1, 20: 1})\n", "defaultdict(, {1: 2019, 2: 3359, 3: 2838, 4: 2091, 5: 1411, 6: 1039, 7: 718, 8: 454, 9: 419, 10: 298, 11: 202, 12: 159, 13: 106, 14: 65, 15: 65, 16: 45, 17: 53, 18: 22, 19: 28, 20: 19, 21: 9, 22: 7, 23: 6, 24: 6, 25: 9, 26: 3, 27: 2, 28: 5, 29: 4, 30: 5, 31: 5, 32: 1, 33: 1, 34: 1, 42: 1, 49: 1, 50: 1, 67: 1})\n" ] } ], "source": [ "from collections import defaultdict\n", "num_mRNAs = defaultdict(int)\n", "num_exons = defaultdict(int)\n", "max_exons = 0\n", "max_span = 0\n", "for contig in db.features_of_type('contig'):\n", " cnt = 0\n", " for gene in db.region((contig.seqid, contig.start, contig.end), featuretype='gene'):\n", " cnt += 1\n", " span = abs(gene.start - gene.end) # strand\n", " if span > max_span:\n", " max_span = span\n", " max_span_gene = gene\n", " my_mRNAs = list(db.children(gene, featuretype='mRNA'))\n", " num_mRNAs[len(my_mRNAs)] += 1\n", " if len(my_mRNAs) == 0:\n", " exon_check = [gene]\n", " else:\n", " exon_check = my_mRNAs\n", " for check in exon_check:\n", " num_exons[len(my_exons)] += 1\n", " if len(my_exons) > max_exons:\n", " max_exons = len(my_exons)\n", " max_exons_gene = gene\n", " print('contig %s, number of genes %d' % (contig.seqid, cnt))\n", "print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons))\n", "print('Max span: %s (%d)' % (max_span_gene.id, max_span))\n", "print(num_mRNAs)\n", "print(num_exons)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }