{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Canarium GBS: BPP analyses\n", "### *Federman et al.*\n", "\n", "This notebook provides all code run BPP analyses performed in Federman et al. (xxxx). All code in this notebook is written in Python." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Required software" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## conda install ipyrad -c ipyrad\n", "## conda install toytree -c eaton-lab\n", "## conda install bpp -c ipyrad" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ipyrad v.0.7.22\n" ] } ], "source": [ "import toytree\n", "import toyplot.svg\n", "import ipyrad as ip\n", "import ipyrad.analysis as ipa\n", "print \"ipyrad v.{}\".format(ip.__version__)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Connect to cluster" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "host compute node: [40 cores] on sacra\n" ] } ], "source": [ "import ipyparallel as ipp\n", "ipyclient = ipp.Client()\n", "ip.cluster_info(ipyclient)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup analyses" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## the subset of six taxa used in BPP analyses, 4 per taxon (or 2)\n", "## exclude hybrid taxon SF172\n", "## exclude D12963 b/c it was not consistently placed in 3B vs 3C vs. sister to both \n", "imap = {\n", " \"1A\": ['SF175', 'SF328', 'SF200'],\n", " \"1B\": ['SF209', 'D13052'],\n", " \"1C\": ['D14528', 'SF276', 'SF286'],\n", " \n", " \"2A\": ['D13101', 'D13103', 'D14482', 'D14483'],\n", " \"2B\": ['D14504', 'D14505', 'D14506'],\n", " \"2C\": ['D14477', 'D14478', 'D14480', 'D14485', 'D14501', 'D14513'], \n", " \n", " \"3A\": ['D13090', 'D12950'],\n", " \"3B\": ['SF155', 'SF224', 'SF228', '5573', 'SF327'],\n", " \"3C\": ['SF164', 'SF153', 'SF160', 'D13053', 'D13063', 'D13075', 'D13097', 'SF197'], \n", " }\n", "\n", "\n", "## make a dictionary with min values to filter loci to those with N samples per species.\n", "minmap = {\n", " \"1A\": 3, \n", " \"1B\": 2, \n", " \"1C\": 3,\n", " \"2A\": 4, \n", " \"2B\": 3, \n", " \"2C\": 4,\n", " \"3A\": 2,\n", " \"3B\": 4,\n", " \"3C\": 4,\n", "}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
3C3B3A2C2B2A1C1B1A
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "## Species tree hypothesis ('guide tree') based on raxml & bucky results\n", "tree9 = \"((1A,(1B,1C)),((2A,(2B,2C)),(3A,(3B,3C))));\"\n", "toytree.tree(tree9).draw(use_edge_lengths=False);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a bpp analysis object" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## initial bpp object\n", "bpp00 = ipa.bpp(\n", " name=\"base\", \n", " data=\"analysis-ipyrad/Canarium-min10_outfiles/Canarium-min10.alleles.loci\",\n", " imap=imap,\n", " minmap=minmap,\n", " guidetree=tree9,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Modify run parameters" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## filtering\n", "bpp00.filters.maxloci = 100\n", "\n", "## bpp params\n", "bpp00.params.burnin = 10000\n", "bpp00.params.nsample = 100000\n", "bpp00.params.sampfreq = 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create objects to run algorithm 00 (species tree fitting)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## run object 1\n", "b1 = bpp00.copy(\"bpp00_theta_2_2000_tau_2_2000\")\n", "b1.params.thetaprior = (2, 2000)\n", "b1.params.tauprior = (2, 2000, 1)\n", "\n", "## run object 2\n", "b2 = bpp00.copy(\"bpp00_theta_2_2000_tau_2_200\")\n", "b2.params.thetaprior = (2, 2000)\n", "b2.params.tauprior = (2, 200, 1)\n", "\n", "## run object 3\n", "b3 = bpp00.copy(\"bpp00_theta_2_200_tau_2_2000\")\n", "b3.params.thetaprior = (2, 200)\n", "b3.params.tauprior = (2, 2000, 1)\n", "\n", "## run object 4\n", "b4 = bpp00.copy(\"bpp00_theta_2_200_tau_2_200\")\n", "b4.params.thetaprior = (2, 200)\n", "b4.params.tauprior = (2, 200, 1)\n", "\n", "## put all into a list\n", "b00list = [b1, b2, b3, b4]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit species tree jobs to cluster" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "submitted 5 bpp jobs [bpp00_theta_2_2000_tau_2_2000] (100 loci)\n", "submitted 5 bpp jobs [bpp00_theta_2_2000_tau_2_200] (100 loci)\n", "submitted 5 bpp jobs [bpp00_theta_2_200_tau_2_2000] (100 loci)\n", "submitted 5 bpp jobs [bpp00_theta_2_200_tau_2_200] (100 loci)\n" ] } ], "source": [ "## submit 5 replicates of each job\n", "for job in b00list:\n", " job.run(ipyclient, nreps=5, randomize_order=True, force=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit prior-only species tree jobs" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## create prior-only run objects\n", "b00plist = [i.copy(i.name+\"-prior-only\") for i in b00list]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## set no-data parameter on them\n", "for job in b00plist:\n", " job.params.usedata = 0" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "submitted 1 bpp jobs [bpp00_theta_2_2000_tau_2_2000-prior-only] (100 loci)\n", "submitted 1 bpp jobs [bpp00_theta_2_2000_tau_2_200-prior-only] (100 loci)\n", "submitted 1 bpp jobs [bpp00_theta_2_200_tau_2_2000-prior-only] (100 loci)\n", "submitted 1 bpp jobs [bpp00_theta_2_200_tau_2_200-prior-only] (100 loci)\n" ] } ], "source": [ "## submit jobs to run (only needs a single rep each)\n", "for job in b00plist:\n", " job.run(ipyclient, randomize_order=True, force=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit species delimitation jobs " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## create delim objects as copies of bpp00s\n", "b01list = [i.copy(i.name.replace(\"bpp00\", \"bpp01\")) for i in b00list]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## modify params on the delim objs\n", "for job in b01list:\n", " job.params.infer_delimit = 1\n", " job.params.delimit_alg = (0, 5)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "submitted 5 bpp jobs [bpp01_theta_2_2000_tau_2_2000] (100 loci)\n", "submitted 5 bpp jobs [bpp01_theta_2_2000_tau_2_200] (100 loci)\n", "submitted 5 bpp jobs [bpp01_theta_2_200_tau_2_2000] (100 loci)\n", "submitted 5 bpp jobs [bpp01_theta_2_200_tau_2_200] (100 loci)\n" ] } ], "source": [ "## submit jobs to run\n", "for job in b01list:\n", " job.run(ipyclient, nreps=5, randomize_order=True, force=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Submit prior-only species delimitation jobs " ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## create prior-only run objects\n", "b01plist = [i.copy(i.name+\"-prior-only\") for i in b01list]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## set no-data parameter on them\n", "for job in b01plist:\n", " job.params.usedata = 0" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "submitted 1 bpp jobs [bpp01_theta_2_2000_tau_2_2000-prior-only] (100 loci)\n", "submitted 1 bpp jobs [bpp01_theta_2_2000_tau_2_200-prior-only] (100 loci)\n", "submitted 1 bpp jobs [bpp01_theta_2_200_tau_2_2000-prior-only] (100 loci)\n", "submitted 1 bpp jobs [bpp01_theta_2_200_tau_2_200-prior-only] (100 loci)\n" ] } ], "source": [ "## submit jobs to run (only needs a single rep each)\n", "for job in b01plist:\n", " job.run(ipyclient, randomize_order=True, force=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------------------------------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summarize results\n", "Here I access the result files from th bpp objects themselves. Alternatively you could enter the path to the results files. " ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## set options to print prettier tables\n", "## (this suppresses scientific notation)\n", "import pandas as pd\n", "import numpy as np\n", "pd.set_option('display.float_format', lambda x: '%.4f' % x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Fixed species tree parameter inference (algorithm 00)\n", "\n", "The first question is whether the replicate runs for the same prior values have converged on similar parameter values. If so, then we should feel OK with combining them and reporting average parameter values as our results. " ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## get mean parameters across different values for the prior with and without data\n", "all00 = pd.DataFrame(\n", " {job.name: job.summarize_results()[\"mean\"] for job in b00list + b00plist})" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
with-dataprior-only
lnL-11006.5848nan
tau_101A1B1C2A2B2C3A3B3C0.00110.0014
tau_111A1B1C0.00070.0009
tau_121B1C0.00040.0005
tau_132A2B2C3A3B3C0.00100.0012
tau_142A2B2C0.00090.0008
tau_152B2C0.00070.0004
tau_163A3B3C0.00070.0008
tau_173B3C0.00040.0004
theta_101A1B1C2A2B2C3A3B3C0.00200.0090
theta_111A1B1C0.00330.0096
theta_11A0.00270.0101
theta_121B1C0.00940.0101
theta_132A2B2C3A3B3C0.00980.0095
theta_142A2B2C0.00740.0100
theta_152B2C0.00990.0103
theta_163A3B3C0.00470.0095
theta_173B3C0.00570.0091
theta_21B0.00110.0099
theta_31C0.00160.0095
theta_42A0.00300.0093
theta_52B0.00120.0100
theta_62C0.01190.0104
theta_73A0.00130.0101
theta_83B0.00840.0102
theta_93C0.00360.0097
\n", "
" ], "text/plain": [ " with-data prior-only\n", "lnL -11006.5848 nan\n", "tau_101A1B1C2A2B2C3A3B3C 0.0011 0.0014\n", "tau_111A1B1C 0.0007 0.0009\n", "tau_121B1C 0.0004 0.0005\n", "tau_132A2B2C3A3B3C 0.0010 0.0012\n", "tau_142A2B2C 0.0009 0.0008\n", "tau_152B2C 0.0007 0.0004\n", "tau_163A3B3C 0.0007 0.0008\n", "tau_173B3C 0.0004 0.0004\n", "theta_101A1B1C2A2B2C3A3B3C 0.0020 0.0090\n", "theta_111A1B1C 0.0033 0.0096\n", "theta_11A 0.0027 0.0101\n", "theta_121B1C 0.0094 0.0101\n", "theta_132A2B2C3A3B3C 0.0098 0.0095\n", "theta_142A2B2C 0.0074 0.0100\n", "theta_152B2C 0.0099 0.0103\n", "theta_163A3B3C 0.0047 0.0095\n", "theta_173B3C 0.0057 0.0091\n", "theta_21B 0.0011 0.0099\n", "theta_31C 0.0016 0.0095\n", "theta_42A 0.0030 0.0093\n", "theta_52B 0.0012 0.0100\n", "theta_62C 0.0119 0.0104\n", "theta_73A 0.0013 0.0101\n", "theta_83B 0.0084 0.0102\n", "theta_93C 0.0036 0.0097" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## save result to file for 200 2000 \n", "outdf = all00[[\"bpp00_theta_2_200_tau_2_2000\", \"bpp00_theta_2_200_tau_2_2000-prior-only\"]]\n", "outdf.columns = [\"with-data\", \"prior-only\"]\n", "outdf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summarize BPP 01 results (delimitation) \n", "These tables show the mean posteriors the five replicates for each set of priors values. " ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bpp01_theta_2_2000_tau_2_2000\n", " delim prior posterior nspecies\n", "0 00000000 0.03226 0.0 1\n", "1 10000000 0.03226 0.0 2\n", "2 10010000 0.03226 0.0 3\n", "3 10010010 0.03226 0.0 4\n", "4 10010011 0.03226 0.0 5\n", "5 10011000 0.03226 0.0 4\n", "6 10011010 0.03226 0.0 5\n", "7 10011011 0.03226 0.0 6\n", "8 10011100 0.03226 0.0 5\n", "9 10011110 0.03226 0.0 6\n", "10 10011111 0.03226 0.007156 7\n", "11 11000000 0.03226 0.0 3\n", "12 11010000 0.03226 0.0 4\n", "13 11010010 0.03226 0.0 5\n", "14 11010011 0.03226 0.0 6\n", "15 11011000 0.03226 0.0 5\n", "16 11011010 0.03226 0.0 6\n", "17 11011011 0.03226 0.0 7\n", "18 11011100 0.03226 0.0 6\n", "19 11011110 0.03226 0.0 7\n", "20 11011111 0.03226 0.060192 8\n", "21 11100000 0.03226 0.0 4\n", "22 11110000 0.03226 0.0 5\n", "23 11110010 0.03226 0.0 6\n", "24 11110011 0.03226 0.0 7\n", "25 11111000 0.03226 0.0 6\n", "26 11111010 0.03226 0.0 7\n", "27 11111011 0.03226 0.0 8\n", "28 11111100 0.03226 0.0 7\n", "29 11111110 0.03226 0.0 8\n", "30 11111111 0.03226 0.932652 9\n", "\n", "bpp01_theta_2_2000_tau_2_200\n", " delim prior posterior nspecies\n", "0 00000000 0.03226 0.0 1\n", "1 10000000 0.03226 0.0 2\n", "2 10010000 0.03226 0.0 3\n", "3 10010010 0.03226 0.0 4\n", "4 10010011 0.03226 0.0 5\n", "5 10011000 0.03226 0.0 4\n", "6 10011010 0.03226 0.0 5\n", "7 10011011 0.03226 0.0 6\n", "8 10011100 0.03226 0.0 5\n", "9 10011110 0.03226 0.0 6\n", "10 10011111 0.03226 0.020182 7\n", "11 11000000 0.03226 0.0 3\n", "12 11010000 0.03226 0.0 4\n", "13 11010010 0.03226 0.0 5\n", "14 11010011 0.03226 0.0 6\n", "15 11011000 0.03226 0.0 5\n", "16 11011010 0.03226 0.0 6\n", "17 11011011 0.03226 0.0 7\n", "18 11011100 0.03226 0.0 6\n", "19 11011110 0.03226 0.0 7\n", "20 11011111 0.03226 0.07754 8\n", "21 11100000 0.03226 0.0 4\n", "22 11110000 0.03226 0.0 5\n", "23 11110010 0.03226 0.0 6\n", "24 11110011 0.03226 0.0 7\n", "25 11111000 0.03226 0.0 6\n", "26 11111010 0.03226 0.0 7\n", "27 11111011 0.03226 0.0 8\n", "28 11111100 0.03226 0.0 7\n", "29 11111110 0.03226 0.0 8\n", "30 11111111 0.03226 0.902278 9\n", "\n", "bpp01_theta_2_200_tau_2_2000\n", " delim prior posterior nspecies\n", "0 00000000 0.03226 0.0 1\n", "1 10000000 0.03226 0.0 2\n", "2 10010000 0.03226 0.0 3\n", "3 10010010 0.03226 0.0 4\n", "4 10010011 0.03226 0.0 5\n", "5 10011000 0.03226 0.0 4\n", "6 10011010 0.03226 0.0 5\n", "7 10011011 0.03226 0.0 6\n", "8 10011100 0.03226 0.0 5\n", "9 10011110 0.03226 0.0 6\n", "10 10011111 0.03226 0.012494 7\n", "11 11000000 0.03226 0.0 3\n", "12 11010000 0.03226 0.0 4\n", "13 11010010 0.03226 0.0 5\n", "14 11010011 0.03226 0.0 6\n", "15 11011000 0.03226 0.0 5\n", "16 11011010 0.03226 0.0 6\n", "17 11011011 0.03226 0.0 7\n", "18 11011100 0.03226 0.0 6\n", "19 11011110 0.03226 0.0 7\n", "20 11011111 0.03226 0.02224 8\n", "21 11100000 0.03226 0.0 4\n", "22 11110000 0.03226 0.0 5\n", "23 11110010 0.03226 0.0 6\n", "24 11110011 0.03226 0.0 7\n", "25 11111000 0.03226 0.0 6\n", "26 11111010 0.03226 0.0 7\n", "27 11111011 0.03226 0.0 8\n", "28 11111100 0.03226 0.0 7\n", "29 11111110 0.03226 0.0 8\n", "30 11111111 0.03226 0.965266 9\n", "\n", "bpp01_theta_2_200_tau_2_200\n", " delim prior posterior nspecies\n", "0 00000000 0.03226 0.0 1\n", "1 10000000 0.03226 0.0 2\n", "2 10010000 0.03226 0.0 3\n", "3 10010010 0.03226 0.0 4\n", "4 10010011 0.03226 0.0 5\n", "5 10011000 0.03226 0.0 4\n", "6 10011010 0.03226 0.0 5\n", "7 10011011 0.03226 0.0 6\n", "8 10011100 0.03226 0.0 5\n", "9 10011110 0.03226 0.0 6\n", "10 10011111 0.03226 0.000128 7\n", "11 11000000 0.03226 0.0 3\n", "12 11010000 0.03226 0.0 4\n", "13 11010010 0.03226 0.0 5\n", "14 11010011 0.03226 0.0 6\n", "15 11011000 0.03226 0.0 5\n", "16 11011010 0.03226 0.0 6\n", "17 11011011 0.03226 0.0 7\n", "18 11011100 0.03226 0.0 6\n", "19 11011110 0.03226 0.0 7\n", "20 11011111 0.03226 0.078476 8\n", "21 11100000 0.03226 0.0 4\n", "22 11110000 0.03226 0.0 5\n", "23 11110010 0.03226 0.0 6\n", "24 11110011 0.03226 0.0 7\n", "25 11111000 0.03226 0.0 6\n", "26 11111010 0.03226 0.0 7\n", "27 11111011 0.03226 0.0 8\n", "28 11111100 0.03226 0.0 7\n", "29 11111110 0.03226 0.0 8\n", "30 11111111 0.03226 0.921396 9\n", "\n" ] } ], "source": [ "for job in b01list:\n", " print job.name\n", " print job.summarize_results()\n", " print \"\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 1 }