\n", "
• \n", "
• \n", " Save as .csv\n", "
• \n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "## parse the newick tree, re-root it, and plot it.\n", "tre = toytree.tree(newick=newick)\n", "tre.root(wildcard=\"prz\")\n", "tre.draw(node_labels=True, node_size=10);\n", "\n", "## store rooted tree back into a newick string.\n", "newick = tre.tree.write()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Short tutorial: calculating abba-baba statistics\n", "To give a gist of what this code can do, here is a quick tutorial version, each step of which we explain in greater detail below. We first create a `'baba'` analysis object that is linked to our data file, in this example we name the variable **bb**. Then we tell it which tests to perform, here by automatically generating a number of tests using the `generate_tests_from_tree()` function. And finally, we calculate the results and plot them. " ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "44 tests generated from tree\n", "[####################] 100% calculating D-stats | 0:02:35 | \n" ] } ], "source": [ "## create a baba object linked to a data file and newick tree\n", "bb = ipa.baba(data=locifile, newick=newick)\n", "\n", "## generate all possible abba-baba tests meeting a set of constraints\n", "bb.generate_tests_from_tree(\n", " constraint_dict={\n", " \"p4\": [\"32082_przewalskii\", \"33588_przewalskii\"],\n", " \"p3\": [\"33413_thamno\"],\n", " })\n", "\n", "## run all tests linked to bb \n", "bb.run(ipyclient)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Look at the results" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dstatbootmeanbootstdZABBABABAnloci
0-0.099-0.0970.0432.311258.844315.4386718
1-0.114-0.1150.0392.886303.938381.9068221
2-0.089-0.0900.0422.131258.750309.5006694
3-0.101-0.1020.0402.544302.188370.1888173
4-0.098-0.0970.0422.331238.688290.7506347
5-0.122-0.1230.0403.079279.375356.8127725
60.0680.0680.0361.907408.969356.7198899
70.0800.0820.0372.178390.312332.3128390
80.0890.0910.0442.022309.500258.7506694
90.0980.0980.0452.173290.750238.6886347
100.1010.1000.0392.610370.188302.1888173
110.1220.1220.0393.112356.812279.3757725
120.1290.1280.0314.129474.500366.0319390
130.1420.1440.0354.073457.250343.6259151
140.1190.1180.0333.554434.500342.0629048
150.2180.2170.0287.666560.750359.9589455
160.2780.2770.0328.650567.500320.7509113
170.1780.1770.0335.322501.938350.5009211
180.1600.1590.0364.387403.750292.4387909
190.1760.1770.0355.043491.625344.3129105
200.1640.1630.0334.987477.984343.2669526
210.0570.0570.0331.737438.625391.4699671
220.0340.0340.0360.939357.750334.0948268
230.0500.0510.0331.528418.812379.1569533
240.1730.1730.0345.070436.375307.6259282
250.0440.0430.0351.263401.969368.0319394
260.0140.0150.0380.373324.750315.6888051
270.0390.0400.0351.114389.625360.0629272
280.1880.1890.0365.257460.688314.8759172
290.0800.0790.0362.252420.906358.4699292
300.0740.0740.0372.003349.438301.2507964
310.0670.0660.0361.866395.875346.4389171
32-0.086-0.0870.0302.923373.188443.7089652
33-0.118-0.1180.0323.738367.552465.9279537
34-0.173-0.1740.0345.129307.625436.3759282
35-0.188-0.1900.0355.326314.875460.6889172
36-0.044-0.0450.0351.263368.031401.9699394
37-0.080-0.0820.0372.182358.469420.9069292
38-0.014-0.0140.0370.382315.688324.7508051
39-0.074-0.0770.0401.854301.250349.4387964
40-0.039-0.0390.0351.114360.062389.6259272
41-0.067-0.0670.0361.858346.438395.8759171
42-0.130-0.1310.0403.215296.938385.7508044
43-0.123-0.1240.0363.434338.938434.3759235
\n", "
" ], "text/plain": [ " dstat bootmean bootstd Z ABBA BABA nloci\n", "0 -0.099 -0.097 0.043 2.311 258.844 315.438 6718\n", "1 -0.114 -0.115 0.039 2.886 303.938 381.906 8221\n", "2 -0.089 -0.090 0.042 2.131 258.750 309.500 6694\n", "3 -0.101 -0.102 0.040 2.544 302.188 370.188 8173\n", "4 -0.098 -0.097 0.042 2.331 238.688 290.750 6347\n", "5 -0.122 -0.123 0.040 3.079 279.375 356.812 7725\n", "6 0.068 0.068 0.036 1.907 408.969 356.719 8899\n", "7 0.080 0.082 0.037 2.178 390.312 332.312 8390\n", "8 0.089 0.091 0.044 2.022 309.500 258.750 6694\n", "9 0.098 0.098 0.045 2.173 290.750 238.688 6347\n", "10 0.101 0.100 0.039 2.610 370.188 302.188 8173\n", "11 0.122 0.122 0.039 3.112 356.812 279.375 7725\n", "12 0.129 0.128 0.031 4.129 474.500 366.031 9390\n", "13 0.142 0.144 0.035 4.073 457.250 343.625 9151\n", "14 0.119 0.118 0.033 3.554 434.500 342.062 9048\n", "15 0.218 0.217 0.028 7.666 560.750 359.958 9455\n", "16 0.278 0.277 0.032 8.650 567.500 320.750 9113\n", "17 0.178 0.177 0.033 5.322 501.938 350.500 9211\n", "18 0.160 0.159 0.036 4.387 403.750 292.438 7909\n", "19 0.176 0.177 0.035 5.043 491.625 344.312 9105\n", "20 0.164 0.163 0.033 4.987 477.984 343.266 9526\n", "21 0.057 0.057 0.033 1.737 438.625 391.469 9671\n", "22 0.034 0.034 0.036 0.939 357.750 334.094 8268\n", "23 0.050 0.051 0.033 1.528 418.812 379.156 9533\n", "24 0.173 0.173 0.034 5.070 436.375 307.625 9282\n", "25 0.044 0.043 0.035 1.263 401.969 368.031 9394\n", "26 0.014 0.015 0.038 0.373 324.750 315.688 8051\n", "27 0.039 0.040 0.035 1.114 389.625 360.062 9272\n", "28 0.188 0.189 0.036 5.257 460.688 314.875 9172\n", "29 0.080 0.079 0.036 2.252 420.906 358.469 9292\n", "30 0.074 0.074 0.037 2.003 349.438 301.250 7964\n", "31 0.067 0.066 0.036 1.866 395.875 346.438 9171\n", "32 -0.086 -0.087 0.030 2.923 373.188 443.708 9652\n", "33 -0.118 -0.118 0.032 3.738 367.552 465.927 9537\n", "34 -0.173 -0.174 0.034 5.129 307.625 436.375 9282\n", "35 -0.188 -0.190 0.035 5.326 314.875 460.688 9172\n", "36 -0.044 -0.045 0.035 1.263 368.031 401.969 9394\n", "37 -0.080 -0.082 0.037 2.182 358.469 420.906 9292\n", "38 -0.014 -0.014 0.037 0.382 315.688 324.750 8051\n", "39 -0.074 -0.077 0.040 1.854 301.250 349.438 7964\n", "40 -0.039 -0.039 0.035 1.114 360.062 389.625 9272\n", "41 -0.067 -0.067 0.036 1.858 346.438 395.875 9171\n", "42 -0.130 -0.131 0.040 3.215 296.938 385.750 8044\n", "43 -0.123 -0.124 0.036 3.434 338.938 434.375 9235" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## save the results table to a csv file\n", "bb.results_table.to_csv(\"bb.abba-baba.csv\", sep=\"\\t\")\n", "\n", "## show the results table in notebook\n", "bb.results_table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plotting and interpreting results\n", "Interpreting the results of D-statistic tests is actually *very* complicated. You cannot treat every test as if it were independent because introgression between one pair of species may cause one or both of those species to *appear* as if they have also introgressed with other taxa in your data set. This problem is described in great detail in [this paper (Eaton et al. 2015)](http://onlinelibrary.wiley.com/doi/10.1111/evo.12758/abstract). A good place to start, then, is to perform many tests and focus on those which have the strongest signal of admixture. Then, perform additional tests, such as `partitioned D-statistics` (described further below) to tease apart whether a single or multiple introgression events are likely to have occurred. \n", "\n", "In the example plot below we find evidence of admixture between the sample **33413_thamno** (black) with several other samples, but the signal is strongest with respect to **30556_thamno** (test 33). It also appears that admixture is consistently detected with samples of (**40578_rex** & **35855_rex**) when contrasted against **35236_rex** (tests 14, 20, 29, 32)." ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "
• \n", "
• \n", " Save as .csv\n", "
• \n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dstatbootmeanbootstdZABBABABAnloci
0-0.022-0.0190.0480.466225.875236.1888439
10.0200.0190.0450.432245.000235.6258993
\n", "
" ], "text/plain": [ " dstat bootmean bootstd Z ABBA BABA nloci\n", "0 -0.022 -0.019 0.048 0.466 225.875 236.188 8439\n", "1 0.020 0.019 0.045 0.432 245.000 235.625 8993" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## you can sort the results by Z-score\n", "cc.results_table.sort_values(by=\"Z\", ascending=False)\n", "\n", "## save the table to a file \n", "cc.results_table.to_csv(\"cc.abba-baba.csv\")\n", "\n", "## show the results in notebook\n", "cc.results_table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Auto-generating tests\n", "Entering all of the tests by hand can be pain, which is why we wrote functions to auto-generate tests given an input **rooted** tree, and a number of contraints on the tests to generate from that tree. It is important to add constraints on the tests otherwise the number that can be produced becomes very large very quickly. Calculating results runs pretty fast, but summarizing and interpreting thousands of results is pretty much impossible, so it is generally better to limit the tests to those which make some intuitive sense to run. You can see in this example that implementing a few contraints reduces the number of tests from 1608 to 13. " ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1608 tests generated from tree\n", "117 tests generated from tree\n", "13 tests generated from tree\n" ] } ], "source": [ "## create a new 'copy' of your baba object and attach a treefile\n", "dd = bb.copy()\n", "dd.newick = newick\n", "\n", "## generate all possible tests\n", "dd.generate_tests_from_tree()\n", "\n", "## a dict of constraints\n", "constraint_dict={\n", " \"p4\": [\"32082_przewalskii\", \"33588_przewalskii\"],\n", " \"p3\": [\"40578_rex\", \"35855_rex\"],\n", " }\n", "\n", "## generate tests with contraints\n", "dd.generate_tests_from_tree(\n", " constraint_dict=constraint_dict, \n", " constraint_exact=False,\n", ")\n", "\n", "## 'exact' contrainst are even more constrained\n", "dd.generate_tests_from_tree(\n", " constraint_dict=constraint_dict,\n", " constraint_exact=True,\n", ")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% calculating D-stats | 0:00:12 | \n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dstatbootmeanbootstdZABBABABAnloci
0-0.107-0.1070.0353.095348.016431.8289298
1-0.130-0.1280.0403.290295.312383.9538537
2-0.111-0.1130.0412.725266.414333.0866975
3-0.138-0.1380.0353.970313.633413.9308715
4-0.157-0.1560.0413.865268.891369.1728010
5-0.141-0.1410.0413.459239.750318.2506597
6-0.105-0.1040.0343.045343.203423.3129275
7-0.129-0.1260.0403.257291.266377.6418517
8-0.109-0.1110.0412.694262.266326.6726961
9-0.021-0.0210.0520.397171.125178.4066468
100.1410.1420.0552.56939.23429.5169218
110.0140.0160.0300.462564.281548.6259456
12-0.036-0.0380.0590.607104.875112.6568480
\n", "
" ], "text/plain": [ " dstat bootmean bootstd Z ABBA BABA nloci\n", "0 -0.107 -0.107 0.035 3.095 348.016 431.828 9298\n", "1 -0.130 -0.128 0.040 3.290 295.312 383.953 8537\n", "2 -0.111 -0.113 0.041 2.725 266.414 333.086 6975\n", "3 -0.138 -0.138 0.035 3.970 313.633 413.930 8715\n", "4 -0.157 -0.156 0.041 3.865 268.891 369.172 8010\n", "5 -0.141 -0.141 0.041 3.459 239.750 318.250 6597\n", "6 -0.105 -0.104 0.034 3.045 343.203 423.312 9275\n", "7 -0.129 -0.126 0.040 3.257 291.266 377.641 8517\n", "8 -0.109 -0.111 0.041 2.694 262.266 326.672 6961\n", "9 -0.021 -0.021 0.052 0.397 171.125 178.406 6468\n", "10 0.141 0.142 0.055 2.569 39.234 29.516 9218\n", "11 0.014 0.016 0.030 0.462 564.281 548.625 9456\n", "12 -0.036 -0.038 0.059 0.607 104.875 112.656 8480" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ "
\n", "
• \n", "
• \n", " Save as .csv\n", "
• \n", "