{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import division, print_function\n", "import dendropy\n", "from dendropy.interop import genbank" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting the data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def get_ebov_2014_sources():\n", " #EBOV_2014\n", " #yield 'EBOV_2014', genbank.GenBankDna(id_range=(233036, 233118), prefix='KM')\n", " yield 'EBOV_2014', genbank.GenBankDna(id_range=(34549, 34563), prefix='KM0')\n", " \n", "def get_other_ebov_sources():\n", " #EBOV other\n", " yield 'EBOV_1976', genbank.GenBankDna(ids=['AF272001', 'KC242801'])\n", " yield 'EBOV_1995', genbank.GenBankDna(ids=['KC242796', 'KC242799'])\n", " yield 'EBOV_2007', genbank.GenBankDna(id_range=(84, 90), prefix='KC2427')\n", " \n", "def get_other_ebolavirus_sources():\n", " #BDBV\n", " yield 'BDBV', genbank.GenBankDna(id_range=(3, 6), prefix='KC54539')\n", " yield 'BDBV', genbank.GenBankDna(ids=['FJ217161'])\n", "\n", " #RESTV\n", " yield 'RESTV', genbank.GenBankDna(ids=['AB050936', 'JX477165', 'JX477166', 'FJ621583', 'FJ621584', 'FJ621585']) \n", "\n", " #SUDV\n", " yield 'SUDV', genbank.GenBankDna(ids=['KC242783', 'AY729654', 'EU338380',\n", " 'JN638998', 'FJ968794', 'KC589025', 'JN638998'])\n", " #yield 'SUDV', genbank.GenBankDna(id_range=(89, 92), prefix='KC5453') \n", "\n", " #TAFV\n", " yield 'TAFV', genbank.GenBankDna(ids=['FJ217162'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "other = open('other.fasta', 'w')\n", "sampled = open('sample.fasta', 'w')\n", "\n", "for species, recs in get_other_ebolavirus_sources():\n", " char_mat = recs.generate_char_matrix(taxon_set=dendropy.TaxonSet(),\n", " gb_to_taxon_func=lambda gb: dendropy.Taxon(label='%s_%s' % (species, gb.accession)))\n", " char_mat.write_to_stream(other, 'fasta')\n", " char_mat.write_to_stream(sampled, 'fasta')\n", "other.close()\n", "ebov_2014 = open('ebov_2014.fasta', 'w')\n", "ebov = open('ebov.fasta', 'w')\n", "for species, recs in get_ebov_2014_sources():\n", " char_mat = recs.generate_char_matrix(taxon_set=dendropy.TaxonSet(),\n", " gb_to_taxon_func=lambda gb: dendropy.Taxon(label='EBOV_2014_%s' % gb.accession))\n", " char_mat.write_to_stream(ebov_2014, 'fasta')\n", " char_mat.write_to_stream(sampled, 'fasta')\n", " char_mat.write_to_stream(ebov, 'fasta')\n", "ebov_2014.close()\n", "\n", "ebov_2007 = open('ebov_2007.fasta', 'w')\n", "for species, recs in get_other_ebov_sources():\n", " char_mat = recs.generate_char_matrix(taxon_set=dendropy.TaxonSet(),\n", " gb_to_taxon_func=lambda gb: dendropy.Taxon(label='%s_%s' % (species, gb.accession)))\n", " char_mat.write_to_stream(ebov, 'fasta')\n", " char_mat.write_to_stream(sampled, 'fasta')\n", " if species == 'EBOV_2007':\n", " char_mat.write_to_stream(ebov_2007, 'fasta')\n", "\n", "ebov.close()\n", "ebov_2007.close()\n", "sampled.close()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Genes" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "my_genes = ['NP', 'L', 'VP35', 'VP40']\n", "\n", "def dump_genes(species, recs, g_dls, p_hdls):\n", " for rec in recs:\n", "\n", " for feature in rec.feature_table:\n", " if feature.key == 'CDS':\n", " gene_name = None\n", " for qual in feature.qualifiers:\n", " if qual.name == 'gene':\n", " if qual.value in my_genes:\n", " gene_name = qual.value\n", " elif qual.name == 'translation':\n", " protein_translation = qual.value\n", " if gene_name is not None:\n", " locs = feature.location.split('.')\n", " start, end = int(locs[0]), int(locs[-1])\n", " g_hdls[gene_name].write('>%s_%s\\n' % (species, rec.accession))\n", " p_hdls[gene_name].write('>%s_%s\\n' % (species, rec.accession))\n", " g_hdls[gene_name].write('%s\\n' % rec.sequence_text[start - 1 : end])\n", " p_hdls[gene_name].write('%s\\n' % protein_translation)\n", "\n", "g_hdls = {}\n", "p_hdls = {}\n", "for gene in my_genes:\n", " g_hdls[gene] = open('%s.fasta' % gene, 'w')\n", " p_hdls[gene] = open('%s_P.fasta' % gene, 'w')\n", "for species, recs in get_other_ebolavirus_sources():\n", " if species in ['RESTV', 'SUDV']:\n", " dump_genes(species, recs, g_hdls, p_hdls)\n", "for gene in my_genes:\n", " g_hdls[gene].close()\n", " p_hdls[gene].close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Genome exploration" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def describe_seqs(seqs):\n", " print('Number of sequences: %d' % len(seqs.taxon_set))\n", " print('First 10 taxon sets: %s' % ' '.join([taxon.label for taxon in seqs.taxon_set[:10]]))\n", " lens = []\n", " for tax, seq in seqs.items():\n", " lens.append(len([x for x in seq.symbols_as_list() if x != '-']))\n", " print('Genome length: min %d, mean %.1f, max %d' % (min(lens), sum(lens) / len(lens), max(lens)))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "EBOV\n", "Number of sequences: 26\n", "First 10 taxon sets: EBOV_2014_KM034549 EBOV_2014_KM034550 EBOV_2014_KM034551 EBOV_2014_KM034552 EBOV_2014_KM034553 EBOV_2014_KM034554 EBOV_2014_KM034555 EBOV_2014_KM034556 EBOV_2014_KM034557 EBOV_2014_KM034558\n", "Genome length: min 18700, mean 18925.2, max 18959\n" ] } ], "source": [ "ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('ebov.fasta', schema='fasta', data_type='dna')\n", "print('EBOV')\n", "describe_seqs(ebov_seqs)\n", "del ebov_seqs" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ebolavirus sequences\n", "Number of sequences: 18\n", "First 10 taxon sets: BDBV_KC545393 BDBV_KC545394 BDBV_KC545395 BDBV_KC545396 BDBV_FJ217161 RESTV_AB050936 RESTV_JX477165 RESTV_JX477166 RESTV_FJ621583 RESTV_FJ621584\n", "Genome length: min 18796, mean 18892.7, max 18940\n", " TAFV: 1\n", " SUDV: 6\n", " RESTV: 6\n", " BDBV: 5\n" ] } ], "source": [ "print('ebolavirus sequences')\n", "ebolav_seqs = dendropy.DnaCharacterMatrix.get_from_path('other.fasta', schema='fasta', data_type='dna')\n", "describe_seqs(ebolav_seqs)\n", "from collections import defaultdict\n", "species = defaultdict(int)\n", "for taxon in ebolav_seqs.taxon_set:\n", " toks = taxon.label.split('_')\n", " my_species = toks[0]\n", " if my_species == 'EBOV':\n", " ident = '%s (%s)' % (my_species, toks[1])\n", " else:\n", " ident = my_species\n", " species[ident] += 1\n", "for my_species, cnt in species.items():\n", " print(\"%20s: %d\" % (my_species, cnt))\n", "del ebolav_seqs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Genes" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " NP: 2218\n", " VP40: 988\n", " L: 6636\n", " VP35: 990\n" ] } ], "source": [ "import os\n", "gene_length = {}\n", "my_genes = ['NP', 'L', 'VP35', 'VP40']\n", "\n", "for name in my_genes:\n", " gene_name = name.split('.')[0]\n", " seqs = dendropy.DnaCharacterMatrix.get_from_path('%s.fasta' % name, schema='fasta', data_type='dna')\n", " gene_length[gene_name] = []\n", " for tax, seq in seqs.items():\n", " gene_length[gene_name].append(len([x for x in seq.symbols_as_list() if x != '-']))\n", "for gene, lens in gene_length.items():\n", " print ('%6s: %d' % (gene, sum(lens) / len(lens)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }