{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import print_function\n", "import dendropy\n", "from dendropy import popgenstat" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Genes" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "from collections import OrderedDict\n", "genes_species = OrderedDict()\n", "my_species = ['RESTV', 'SUDV']\n", "my_genes = ['NP', 'L', 'VP35', 'VP40']\n", "\n", "for name in my_genes:\n", " gene_name = name.split('.')[0]\n", " char_mat = dendropy.DnaCharacterMatrix.get_from_path('%s_align.fasta' % name, 'fasta')\n", " genes_species[gene_name] = {}\n", " \n", " for species in my_species:\n", " genes_species[gene_name][species] = dendropy.DnaCharacterMatrix()\n", " for taxon, char_map in char_mat.items():\n", " species = taxon.label.split('_')[0]\n", " if species in my_species:\n", " genes_species[gene_name][species].extend_map({taxon: char_map})" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
seg_sites (RESTV)nuc_div (RESTV)taj_d (RESTV)wat_theta (RESTV)seg_sites (SUDV)nuc_div (SUDV)taj_d (SUDV)wat_theta (SUDV)
NP1130.020659-0.48227549.4890511180.0296301.20352256.64
L2880.018143-0.295386126.1313872820.0241931.412350135.36
VP35420.017099-0.53033018.394161500.0277611.06906124.00
VP40610.026155-0.18813526.715328410.0235171.26916019.68
\n", "
" ], "text/plain": [ " seg_sites (RESTV) nuc_div (RESTV) taj_d (RESTV) wat_theta (RESTV) \\\n", "NP 113 0.020659 -0.482275 49.489051 \n", "L 288 0.018143 -0.295386 126.131387 \n", "VP35 42 0.017099 -0.530330 18.394161 \n", "VP40 61 0.026155 -0.188135 26.715328 \n", "\n", " seg_sites (SUDV) nuc_div (SUDV) taj_d (SUDV) wat_theta (SUDV) \n", "NP 118 0.029630 1.203522 56.64 \n", "L 282 0.024193 1.412350 135.36 \n", "VP35 50 0.027761 1.069061 24.00 \n", "VP40 41 0.023517 1.269160 19.68 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "summary = np.ndarray(shape=(len(genes_species), 4 * len(my_species)))\n", "stats = ['seg_sites', 'nuc_div', 'taj_d', 'wat_theta']\n", "for row, (gene, species_data) in enumerate(genes_species.items()):\n", " for col_base, species in enumerate(my_species):\n", " summary[row, col_base * 4] = popgenstat.num_segregating_sites(species_data[species])\n", " summary[row, col_base * 4 + 1] = popgenstat.nucleotide_diversity(species_data[species])\n", " summary[row, col_base * 4 + 2] = popgenstat.tajimas_d(species_data[species])\n", " summary[row, col_base * 4 + 3] = popgenstat.wattersons_theta(species_data[species])\n", "columns = []\n", "for species in my_species:\n", " columns.extend(['%s (%s)' % (stat, species) for stat in stats])\n", "df = pd.DataFrame(summary, index=genes_species.keys(), columns=columns)\n", "df # vs print(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Genomes" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def do_basic_popgen(seqs):\n", " num_seg_sites = popgenstat.num_segregating_sites(seqs)\n", " avg_pair = popgenstat.average_number_of_pairwise_differences(seqs)\n", " nuc_div = popgenstat.nucleotide_diversity(seqs)\n", " print('Segregating sites: %d, Avg pairwise diffs: %.2f, Nucleotide diversity %.6f' % (num_seg_sites, avg_pair, nuc_div))\n", " print(\"Watterson's theta: %s\" % popgenstat.wattersons_theta(seqs))\n", " print(\"Tajima's D: %s\" % popgenstat.tajimas_d(seqs))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BDBV_KC545393 18728 bp\n", "BDBV_KC545395 18728 bp\n", "BDBV_KC545394 18728 bp\n", "BDBV_KC545396 18728 bp\n", "BDBV_FJ217161 18728 bp\n", "TAFV_FJ217162 18728 bp\n", "EBOV_2014_KM034549 18728 bp\n", "EBOV_2014_KM034550 18728 bp\n", "EBOV_2014_KM034554 18728 bp\n", "EBOV_2014_KM034555 18728 bp\n", "EBOV_2014_KM034556 18728 bp\n", "EBOV_2014_KM034557 18728 bp\n", "EBOV_2014_KM034560 18728 bp\n", "EBOV_2014_KM034551 18728 bp\n", "EBOV_2014_KM034552 18728 bp\n", "EBOV_2014_KM034553 18728 bp\n", "EBOV_2014_KM034558 18728 bp\n", "EBOV_2014_KM034559 18728 bp\n", "EBOV_2014_KM034561 18728 bp\n", "EBOV_2014_KM034562 18728 bp\n", "EBOV_2014_KM034563 18728 bp\n", "EBOV_1976_AF272001 18728 bp\n", "EBOV_1976_KC242801 18728 bp\n", "EBOV_1995_KC242796 18728 bp\n", "EBOV_1995_KC242799 18728 bp\n", "EBOV_2007_KC242784 18728 bp\n", "EBOV_2007_KC242785 18728 bp\n", "EBOV_2007_KC242787 18728 bp\n", "EBOV_2007_KC242786 18728 bp\n", "EBOV_2007_KC242789 18728 bp\n", "EBOV_2007_KC242788 18728 bp\n", "EBOV_2007_KC242790 18728 bp\n", "RESTV_AB050936 18728 bp\n", "RESTV_JX477166 18728 bp\n", "RESTV_FJ621585 18728 bp\n", "RESTV_JX477165 18728 bp\n", "RESTV_FJ621583 18728 bp\n", "RESTV_FJ621584 18728 bp\n", "SUDV_KC242783 18728 bp\n", "SUDV_FJ968794 18728 bp\n", "SUDV_EU338380 18728 bp\n", "SUDV_AY729654 18728 bp\n", "SUDV_JN638998 18728 bp\n", "SUDV_KC589025 18728 bp\n" ] } ], "source": [ "ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('trim.fasta',\n", " schema='fasta', data_type='dna')\n", "sl_2014 = []\n", "drc_2007 = []\n", "ebov2007_set = dendropy.DnaCharacterMatrix()\n", "ebov2014_set = dendropy.DnaCharacterMatrix()\n", "for taxon, char_map in ebov_seqs.items():\n", " print(taxon.label)\n", " if taxon.label.startswith('EBOV_2014') and len(sl_2014) < 8:\n", " sl_2014.append(char_map)\n", " ebov2014_set.extend_map({taxon: char_map})\n", " elif taxon.label.startswith('EBOV_2007'):\n", " drc_2007.append(char_map)\n", " ebov2007_set.extend_map({taxon: char_map})\n", "del ebov_seqs" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2007 outbreak:\n", "Number of individuals: 7\n", "Segregating sites: 25, Avg pairwise diffs: 7.71, Nucleotide diversity 0.000412\n", "Watterson's theta: 10.2040816327\n", "Tajima's D: -1.38311415748\n", "\n", "2014 outbreak:\n", "Number of individuals: 8\n", "Segregating sites: 6, Avg pairwise diffs: 2.79, Nucleotide diversity 0.000149\n", "Watterson's theta: 2.31404958678\n", "Tajima's D: 0.950120802758\n" ] } ], "source": [ "print('2007 outbreak:')\n", "print('Number of individuals: %s' % len(ebov2007_set.taxon_set))\n", "do_basic_popgen(ebov2007_set)\n", "\n", "print('\\n2014 outbreak:')\n", "print('Number of individuals: %s' % len(ebov2014_set.taxon_set))\n", "do_basic_popgen(ebov2014_set)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8\n", "7\n" ] } ], "source": [ "print(len(sl_2014))\n", "print(len(drc_2007))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average number of pairwise differences irrespective of population: 284.46\n", "Average number of pairwise differences between populations: 529.07\n", "Average number of pairwise differences within populations: 10.50\n", "Average number of net pairwise differences : 518.57\n", "Number of segregating sites: 549\n", "Watterson's theta: 168.84\n", "Wakeley's Psi: 0.296\n", "Tajima's D: 3.05\n" ] } ], "source": [ "print('Average number of pairwise differences irrespective of population: %.2f' %\n", " pair_stats.average_number_of_pairwise_differences)\n", "print('Average number of pairwise differences between populations: %.2f' %\n", " pair_stats.average_number_of_pairwise_differences_between)\n", "print('Average number of pairwise differences within populations: %.2f' %\n", " pair_stats.average_number_of_pairwise_differences_within)\n", "print('Average number of net pairwise differences : %.2f' %\n", " pair_stats.average_number_of_pairwise_differences_net)\n", "print('Number of segregating sites: %d' %\n", " pair_stats.num_segregating_sites)\n", "print(\"Watterson's theta: %.2f\" %\n", " pair_stats.wattersons_theta)\n", "print(\"Wakeley's Psi: %.3f\" % pair_stats.wakeleys_psi)\n", "print(\"Tajima's D: %.2f\" % pair_stats.tajimas_d)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }