{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# PE ddRAD data analysis with denovo and reference based assembly\n", "Data from \"Single-nucleotide polymorphism discovery and panel characterization in the African forest elephant\"\n", "\n", "* http://onlinelibrary.wiley.com/doi/10.1002/ece3.3854/full?wol1URL=/doi/10.1002/ece3.3854/full®ionCode=US-NY&identityKey=34b23ec9-4666-4c37-8470-8448e64d6167\n", "\n", "## Other PE ddRAD datasets that could have been useful\n", "Ptychadena\n", "* http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0190440#sec022\n", "\n", "Peromyscus (Munshi-South)\n", "\n", "Chinese sea bass (SRP094869)\n", "* Population Genomics Reveals Genetic Divergence and Adaptive Differentiation of Chinese Sea Bass (Lateolabrax maculatus)\n", "* https://link.springer.com/article/10.1007/s10126-017-9786-0#Sec2" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import ipyrad as ip\n", "import ipyrad.analysis as ipa\n", "import ipyparallel as ipp\n", "import pandas as pd\n", "import toyplot\n", "import glob\n", "import gzip\n", "\n", "\n", "## Ptychadena (Stephane)\n", "## * http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0190440#sec022\n", "\n", "## Peromyscus (Munshi-South)\n", "\n", "## Chinese sea bass (SRP094869)\n", "## Population Genomics Reveals Genetic Divergence and Adaptive Differentiation of Chinese Sea Bass (Lateolabrax maculatus)\n", "## * https://link.springer.com/article/10.1007/s10126-017-9786-0#Sec2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "conda install -c eaton-lab toytree\n", "conda install -c bioconda sra-tools entrez-direct" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "## Fetch the elephant reference genome\n", "mkdir ref\n", "wget ftp://ftp.broadinstitute.org/pub/assemblies/mammals/elephant/loxAfr3/assembly_supers.fasta.gz -o ref/assembly_supers.fasta.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "\n", "## Data from \"Single-nucleotide polymorphism discovery and panel characterization in the African forest elephant\"\n", "## http://onlinelibrary.wiley.com/doi/10.1002/ece3.3854/full?wol1URL=/doi/10.1002/ece3.3854/full®ionCode=US-NY&identityKey=34b23ec9-4666-4c37-8470-8448e64d6167\n", "ipyrad --download SRP126637 raws/" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['raws/LOC0310_SRR6371511.fastq.gz', 'raws/LOC0088_SRR6371521.fastq.gz', 'raws/LOC0311_SRR6371510.fastq.gz', 'raws/LOC0051_SRR6371516.fastq.gz', 'raws/LOC0037_SRR6371514.fastq.gz', 'raws/LOC0038_SRR6371513.fastq.gz', 'raws/LOC0040_SRR6371512.fastq.gz', 'raws/LOC0041_SRR6371519.fastq.gz', 'raws/LOC0050_SRR6371517.fastq.gz', 'raws/LOC0309_SRR6371508.fastq.gz', 'raws/LOC0151_SRR6371505.fastq.gz', 'raws/LOC0127_SRR6371502.fastq.gz', 'raws/LOC0279_SRR6371509.fastq.gz', 'raws/LOC0274_SRR6371506.fastq.gz', 'raws/LOC0049_SRR6371518.fastq.gz', 'raws/LOC0035_SRR6371515.fastq.gz', 'raws/LOC0121_SRR6371520.fastq.gz', 'raws/LOC0263_SRR6371507.fastq.gz', 'raws/LOC0122_SRR6371503.fastq.gz']\n", "LOC0310 535328\n", "LOC0088 2532764\n", "LOC0311 2381720\n", "LOC0051 2267296\n", "LOC0037 4639748\n", "LOC0038 4352988\n", "LOC0040 1516120\n", "LOC0041 2733056\n", "LOC0050 3633896\n", "LOC0309 2004280\n", "LOC0151 2005558\n", "LOC0127 482392\n", "LOC0279 5789736\n", "LOC0274 3165976\n", "LOC0049 2942484\n", "LOC0035 1812120\n", "LOC0121 450136\n", "LOC0263 5038456\n", "LOC0122 2266816\n" ] } ], "source": [ "## R1 and R2 were concatenated in the SRA data files so we have to pull them apart.\n", "## This is probably not the most efficient way to do this, but it works.\n", "raws = glob.glob(\"raws/*\")\n", "print(raws)\n", "for r in raws:\n", " name = r.split(\"/\")[1].split(\"_\")[0]\n", " lines = gzip.open(r).readlines()\n", " nlines = len(lines)/2\n", " print(\"{} {}\".format(name, nlines))\n", " with gzip.open(\"raws/{}_R1_.fastq.gz\".format(name), 'wb') as r1:\n", " r1.write(\"\".join(lines[:nlines]))\n", " with gzip.open(\"raws/{}_R2_.fastq.gz\".format(name), 'wb') as r2:\n", " r2.write(\"\".join(lines[nlines:]))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "Fetching project data...\n", " Run spots mates ScientificName SampleName\n", "0 SRR6371502 241196 0 Loxodonta cyclotis LOC0127\n", "1 SRR6371503 1133408 0 Loxodonta cyclotis LOC0122\n", "2 SRR6371505 1002779 0 Loxodonta cyclotis LOC0151\n", "3 SRR6371506 1582988 0 Loxodonta cyclotis LOC0274\n", "4 SRR6371507 2519228 0 Loxodonta cyclotis LOC0263\n", "5 SRR6371508 1002140 0 Loxodonta cyclotis LOC0309\n", "6 SRR6371509 2894868 0 Loxodonta cyclotis LOC0279\n", "7 SRR6371510 1190860 0 Loxodonta cyclotis LOC0311\n", "8 SRR6371511 267664 0 Loxodonta cyclotis LOC0310\n", "9 SRR6371512 758060 0 Loxodonta cyclotis LOC0040\n", "10 SRR6371513 2176494 0 Loxodonta cyclotis LOC0038\n", "11 SRR6371514 2319874 0 Loxodonta cyclotis LOC0037\n", "12 SRR6371515 906060 0 Loxodonta cyclotis LOC0035\n", "13 SRR6371516 1133648 0 Loxodonta cyclotis LOC0051\n", "14 SRR6371517 1816948 0 Loxodonta cyclotis LOC0050\n", "15 SRR6371518 1471242 0 Loxodonta cyclotis LOC0049\n", "16 SRR6371519 1366528 0 Loxodonta cyclotis LOC0041\n", "17 SRR6371520 225068 0 Loxodonta cyclotis LOC0121\n", "18 SRR6371521 1266382 0 Loxodonta cyclotis LOC0088\n", "```" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 7 8\n", "0 SRR6371521 LOC0088_a Lope\n", "1 SRR6371520 LOC0121_a Minkebe\n", "2 SRR6371519 LOC0041_a Waka\n", "3 SRR6371518 LOC0049_a Ivindo\n", "4 SRR6371517 LOC0050_b Ivindo\n", "5 SRR6371516 LOC0051_a Ivindo\n", "6 SRR6371515 LOC0035_a Minkebe\n", "7 SRR6371514 LOC0037_a Lope\n", "8 SRR6371513 LOC0038_a Lope\n", "9 SRR6371512 LOC0040_a Wonga Wongue\n", "10 SRR6371511 LOC0310_a Moukalaba Doudou\n", "11 SRR6371510 LOC0311_a Monts de Cristal\n", "12 SRR6371509 LOC0279_b South Mulundu\n", "13 SRR6371508 LOC0309_a Mayumba\n", "14 SRR6371507 LOC0263_a Wonga Wongue\n", "15 SRR6371506 LOC0274_a Loango\n", "16 SRR6371505 LOC0151_a Moukalaba Doudou\n", "17 SRR6371503 LOC0122_a Minkebe\n", "18 SRR6371502 LOC0127_a Moukalaba Doudou\n" ] } ], "source": [ "df = pd.DataFrame([x.split(\"\\t\") for x in \"\"\"SRR6371521\tSAMN08167496\tLOC0088\tWG011\t178\t95\tSRX3466825\tLOC0088_a\tLope\n", "SRR6371520\tSAMN08167497\tLOC0121\tWG012\t31\t16\tSRX3466826\tLOC0121_a\tMinkebe\n", "SRR6371519\tSAMN08167492\tLOC0041\tWG005\t192\t102\tSRX3466827\tLOC0041_a\tWaka\n", "SRR6371518\tSAMN08167493\tLOC0049\tWG008\t207\t110\tSRX3466828\tLOC0049_a\tIvindo\n", "SRR6371517\tSAMN08167494\tLOC0050\tWG009\t256\t135\tSRX3466829\tLOC0050_b\tIvindo\n", "SRR6371516\tSAMN08167495\tLOC0051\tWG010\t160\t85\tSRX3466830\tLOC0051_a\tIvindo\n", "SRR6371515\tSAMN08167488\tLOC0035\tWG001\t127\t69\tSRX3466831\tLOC0035_a\tMinkebe\n", "SRR6371514\tSAMN08167489\tLOC0037\tWG002\t327\t175\tSRX3466832\tLOC0037_a\tLope\n", "SRR6371513\tSAMN08167490\tLOC0038\tWG003\t307\t164\tSRX3466833\tLOC0038_a\tLope\n", "SRR6371512\tSAMN08167491\tLOC0040\tWG004\t106\t56\tSRX3466834\tLOC0040_a\tWonga Wongue\n", "SRR6371511\tSAMN08167506\tLOC0310\tWG023\t37\t20\tSRX3466835\tLOC0310_a\tMoukalaba Doudou\n", "SRR6371510\tSAMN08167507\tLOC0311\tWG024\t168\t89\tSRX3466836\tLOC0311_a\tMonts de Cristal\n", "SRR6371509\tSAMN08167504\tLOC0279\tWG020\t408\t215\tSRX3466837\tLOC0279_b\tSouth Mulundu\n", "SRR6371508\tSAMN08167505\tLOC0309\tWG022\t141\t75\tSRX3466838\tLOC0309_a\tMayumba\n", "SRR6371507\tSAMN08167502\tLOC0263\tWG018\t355\t188\tSRX3466839\tLOC0263_a\tWonga Wongue\n", "SRR6371506\tSAMN08167503\tLOC0274\tWG019\t223\t117\tSRX3466840\tLOC0274_a\tLoango\n", "SRR6371505\tSAMN08167500\tLOC0151\tWG015\t141\t71\tSRX3466841\tLOC0151_a\tMoukalaba Doudou\n", "SRR6371503\tSAMN08167498\tLOC0122\tWG013\t159\t85\tSRX3466843\tLOC0122_a\tMinkebe\n", "SRR6371502\tSAMN08167499\tLOC0127\tWG014\t34\t18\tSRX3466844\tLOC0127_a\tMoukalaba Doudou\"\"\".split(\"\\n\")])\n", "\n", "## Just get the sequence identifier, the sample name, and the sample location\n", "df = df.iloc[:, [0,7,8]]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Launch the cluster by running this command on the command line:\n", "\n", " ipcluster start --n=19 --daemonize" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## connect to cluster and check if all the engines are running\n", "ipyclient = ipp.Client()\n", "ipyclient.ids" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New Assembly: Loxodonta\n" ] } ], "source": [ "data = ip.Assembly(\"Loxodonta\")" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 assembly_name Loxodonta \n", "1 project_dir ./ddrad-denovo \n", "2 raw_fastq_path \n", "3 barcodes_path \n", "4 sorted_fastq_path ./raws/*_.fastq \n", "5 assembly_method denovo \n", "6 reference_sequence \n", "7 datatype pairddrad \n", "8 restriction_overhang ('TGCAG', 'CATGC') \n", "9 max_low_qual_bases 5 \n", "10 phred_Qscore_offset 33 \n", "11 mindepth_statistical 6 \n", "12 mindepth_majrule 6 \n", "13 maxdepth 10000 \n", "14 clust_threshold 0.9 \n", "15 max_barcode_mismatch 0 \n", "16 filter_adapters 2 \n", "17 filter_min_trim_len 35 \n", "18 max_alleles_consens 2 \n", "19 max_Ns_consens (5, 5) \n", "20 max_Hs_consens (5, 5) \n", "21 min_samples_locus 4 \n", "22 max_SNPs_locus (20, 20) \n", "23 max_Indels_locus (8, 8) \n", "24 max_shared_Hs_locus 0.5 \n", "25 trim_reads (0, 0, 0, 0) \n", "26 trim_loci (0, 0, 0, 0) \n", "27 output_formats ['G', 'a', 'g', 'k', 'm', 'l', 'n', 'p', 's', 'u', 't', 'v']\n", "28 pop_assign_file \n" ] } ], "source": [ "## set parameters\n", "data.set_params(\"project_dir\", \"ddrad-denovo\")\n", "data.set_params(\"sorted_fastq_path\", \"raws/*_.fastq\")\n", "data.set_params(\"datatype\", \"pairddrad\")\n", "data.set_params(\"restriction_overhang\", (\"TGCAG\", \"CATGC\"))\n", "data.set_params(\"clust_threshold\", \"0.90\")\n", "data.set_params(\"filter_adapters\", \"2\")\n", "data.set_params(\"max_Hs_consens\", (5, 5))\n", "data.set_params(\"trim_loci\", (0, 0, 0, 0))\n", "data.set_params(\"output_formats\", \"*\")\n", "\n", "## see/print all parameters\n", "data.get_params()\n", "data.write_params(force=True)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: Loxodonta\n", " Skipping: 19 Samples already found in Assembly Loxodonta.\n", " (can overwrite with force argument) \n", "[####################] 100% processing reads | 0:00:02 | s2 | \n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n" ] } ], "source": [ "data.run(\"12\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statereads_rawreads_passed_filter
LOC0035_SRR63715152906060905955
LOC0037_SRR6371514223198742319590
LOC0038_SRR6371513221764942176182
LOC0040_SRR63715122758060757942
LOC0041_SRR6371519213665281366341
LOC0049_SRR6371518214712421471081
LOC0050_SRR6371517218169481816737
LOC0051_SRR6371516211336481133530
LOC0088_SRR6371521212663821266181
LOC0121_SRR63715202225068225031
LOC0122_SRR6371503211334081133305
LOC0127_SRR63715022241196241079
LOC0151_SRR6371505210027791002680
LOC0263_SRR6371507225192282518887
LOC0274_SRR6371506215829881582731
LOC0279_SRR6371509228948682894465
LOC0309_SRR6371508210021401002002
LOC0310_SRR63715112267664267643
LOC0311_SRR6371510211908601190747
\n", "
" ], "text/plain": [ " state reads_raw reads_passed_filter\n", "LOC0035_SRR6371515 2 906060 905955\n", "LOC0037_SRR6371514 2 2319874 2319590\n", "LOC0038_SRR6371513 2 2176494 2176182\n", "LOC0040_SRR6371512 2 758060 757942\n", "LOC0041_SRR6371519 2 1366528 1366341\n", "LOC0049_SRR6371518 2 1471242 1471081\n", "LOC0050_SRR6371517 2 1816948 1816737\n", "LOC0051_SRR6371516 2 1133648 1133530\n", "LOC0088_SRR6371521 2 1266382 1266181\n", "LOC0121_SRR6371520 2 225068 225031\n", "LOC0122_SRR6371503 2 1133408 1133305\n", "LOC0127_SRR6371502 2 241196 241079\n", "LOC0151_SRR6371505 2 1002779 1002680\n", "LOC0263_SRR6371507 2 2519228 2518887\n", "LOC0274_SRR6371506 2 1582988 1582731\n", "LOC0279_SRR6371509 2 2894868 2894465\n", "LOC0309_SRR6371508 2 1002140 1002002\n", "LOC0310_SRR6371511 2 267664 267643\n", "LOC0311_SRR6371510 2 1190860 1190747" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## access the stats of the assembly (so far) from the .stats attribute\n", "data.stats" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: Loxodonta\n", "[####################] 100% dereplicating | 0:00:22 | s3 | \n", "[####################] 100% clustering | 0:06:02 | s3 | \n", "[####################] 100% building clusters | 0:00:21 | s3 | \n", "[####################] 100% chunking | 0:00:03 | s3 | \n", "[####################] 100% aligning | 0:20:57 | s3 | \n", "[####################] 100% concatenating | 0:00:13 | s3 | \n", "[####################] 100% inferring [H, E] | 0:01:48 | s4 | \n", "[####################] 100% calculating depths | 0:00:04 | s5 | \n", "[####################] 100% chunking clusters | 0:00:04 | s5 | \n", "[####################] 100% consens calling | 0:03:03 | s5 | \n", "[####################] 100% concat/shuffle input | 0:00:04 | s6 | \n", "[####################] 100% clustering across | 0:02:13 | s6 | \n", "[####################] 100% building clusters | 0:00:03 | s6 | \n", "[####################] 100% aligning clusters | 0:01:07 | s6 | \n", "[####################] 100% database indels | 0:00:07 | s6 | \n", "[####################] 100% indexing clusters | 0:00:04 | s6 | \n", "[####################] 100% building database | 0:00:13 | s6 | \n", "[####################] 100% filtering loci | 0:00:10 | s7 | \n", "[####################] 100% building loci/stats | 0:00:03 | s7 | \n", "[####################] 100% building alleles | 0:00:04 | s7 | \n", "[####################] 100% building vcf file | 0:00:07 | s7 | \n", "[####################] 100% writing vcf file | 0:00:00 | s7 | \n", "[####################] 100% building arrays | 0:00:04 | s7 | \n", "[####################] 100% writing outfiles | 0:00:06 | s7 | \n", "Outfiles written to: ~/ipyrad/test-data/manu-ddrad-elephants/ddrad-denovo/Loxodonta_outfiles\n", "\n" ] } ], "source": [ "## run steps 3-6 of the assembly\n", "data.run(\"34567\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "submitted 4 structure jobs [quick-K-2]\n", "submitted 4 structure jobs [quick-K-3]\n", "submitted 4 structure jobs [quick-K-4]\n", "submitted 4 structure jobs [quick-K-5]\n", "submitted 4 structure jobs [quick-K-6]\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvalues = [2, 3, 4, 5, 6]\n", "s = ipa.structure(\n", " name=\"quick\",\n", " workdir=\"./analysis-structure\",\n", " data=\"./ddrad-denovo/Loxodonta_outfiles/Loxodonta.ustr\",\n", " )\n", "\n", "## set main params (use much larger values in a real analysis)\n", "s.mainparams.burnin = 1000\n", "s.mainparams.numreps = 5000\n", "\n", "## submit N replicates of each test to run on parallel client\n", "for kpop in kvalues:\n", " s.run(kpop=kpop, nreps=4, ipyclient=ipyclient)\n", "\n", "## wait for parallel jobs to finish\n", "ipyclient.wait()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NrepsdeltaKestLnProbMeanestLnProbStdevlnPKlnPPK
240.000-5.503e+055.116e+050.0000.000e+00
343.163-3.206e+053.185e+05229741.7251.007e+06
440.455-1.098e+066.932e+05-777577.3753.152e+05
541.264-1.560e+066.283e+05-462336.8257.943e+05
640.000-1.229e+061.059e+06331935.4500.000e+00
\n", "
" ], "text/plain": [ " Nreps deltaK estLnProbMean estLnProbStdev lnPK lnPPK\n", "2 4 0.000 -5.503e+05 5.116e+05 0.000 0.000e+00\n", "3 4 3.163 -3.206e+05 3.185e+05 229741.725 1.007e+06\n", "4 4 0.455 -1.098e+06 6.932e+05 -777577.375 3.152e+05\n", "5 4 1.264 -1.560e+06 6.283e+05 -462336.825 7.943e+05\n", "6 4 0.000 -1.229e+06 1.059e+06 331935.450 0.000e+00" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## return the evanno table (deltaK) for best K \n", "etable = s.get_evanno_table(kvalues)\n", "etable" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{2: 0 1\n", "LOC0035_SRR6371515 2.510e-01 7.490e-01\n", "LOC0037_SRR6371514 4.995e-01 5.005e-01\n", "LOC0038_SRR6371513 4.972e-01 5.027e-01\n", "LOC0040_SRR6371512 1.000e-03 9.990e-01\n", "LOC0041_SRR6371519 5.835e-01 4.165e-01\n", "LOC0049_SRR6371518 9.995e-01 5.000e-04\n", "LOC0050_SRR6371517 1.000e+00 0.000e+00\n", "LOC0051_SRR6371516 9.988e-01 1.300e-03\n", "LOC0088_SRR6371521 3.583e-01 6.418e-01\n", "LOC0121_SRR6371520 2.000e-03 9.980e-01\n", "LOC0122_SRR6371503 2.700e-03 9.972e-01\n", "LOC0127_SRR6371502 1.512e-01 8.487e-01\n", "LOC0151_SRR6371505 2.515e-01 7.485e-01\n", "LOC0263_SRR6371507 8.000e-04 9.992e-01\n", "LOC0274_SRR6371506 2.507e-01 7.492e-01\n", "LOC0279_SRR6371509 4.995e-01 5.005e-01\n", "LOC0309_SRR6371508 3.872e-01 6.128e-01\n", "LOC0310_SRR6371511 2.500e-01 7.500e-01\n", "LOC0311_SRR6371510 4.992e-01 5.008e-01, 3: 0 1 2\n", "LOC0035_SRR6371515 1.500e-03 9.800e-02 9.005e-01\n", "LOC0037_SRR6371514 3.000e-04 9.995e-01 3.000e-04\n", "LOC0038_SRR6371513 3.000e-04 9.995e-01 3.000e-04\n", "LOC0040_SRR6371512 2.382e-01 2.500e-01 5.117e-01\n", "LOC0041_SRR6371519 1.500e-03 4.993e-01 4.993e-01\n", "LOC0049_SRR6371518 9.992e-01 5.000e-04 3.000e-04\n", "LOC0050_SRR6371517 1.000e+00 0.000e+00 0.000e+00\n", "LOC0051_SRR6371516 9.998e-01 0.000e+00 3.000e-04\n", "LOC0088_SRR6371521 2.520e-01 7.248e-01 2.320e-02\n", "LOC0121_SRR6371520 2.000e-03 2.510e-01 7.470e-01\n", "LOC0122_SRR6371503 2.500e-01 2.502e-01 4.997e-01\n", "LOC0127_SRR6371502 1.203e-01 3.405e-01 5.392e-01\n", "LOC0151_SRR6371505 5.620e-02 4.430e-01 5.007e-01\n", "LOC0263_SRR6371507 3.200e-03 2.300e-03 9.945e-01\n", "LOC0274_SRR6371506 2.720e-01 2.513e-01 4.767e-01\n", "LOC0279_SRR6371509 9.000e-03 2.617e-01 7.293e-01\n", "LOC0309_SRR6371508 7.132e-01 3.800e-03 2.830e-01\n", "LOC0310_SRR6371511 3.850e-02 2.833e-01 6.783e-01\n", "LOC0311_SRR6371510 4.722e-01 2.515e-01 2.762e-01, 4: 0 1 2 3\n", "LOC0035_SRR6371515 0.010 1.950e-02 0.726 2.447e-01\n", "LOC0037_SRR6371514 0.001 9.900e-01 0.007 1.800e-03\n", "LOC0038_SRR6371513 0.001 9.828e-01 0.013 3.000e-03\n", "LOC0040_SRR6371512 0.002 6.850e-02 0.454 4.760e-01\n", "LOC0041_SRR6371519 0.052 4.800e-03 0.446 4.980e-01\n", "LOC0049_SRR6371518 0.994 2.000e-03 0.003 1.000e-03\n", "LOC0050_SRR6371517 0.998 3.000e-04 0.002 3.000e-04\n", "LOC0051_SRR6371516 0.995 8.000e-04 0.003 1.500e-03\n", "LOC0088_SRR6371521 0.067 6.450e-01 0.242 4.650e-02\n", "LOC0121_SRR6371520 0.028 4.700e-01 0.471 3.070e-02\n", "LOC0122_SRR6371503 0.038 2.665e-01 0.254 4.415e-01\n", "LOC0127_SRR6371502 0.015 2.355e-01 0.299 4.500e-01\n", "LOC0151_SRR6371505 0.382 2.782e-01 0.265 7.570e-02\n", "LOC0263_SRR6371507 0.011 2.750e-02 0.536 4.257e-01\n", "LOC0274_SRR6371506 0.003 2.515e-01 0.717 2.850e-02\n", "LOC0279_SRR6371509 0.004 2.525e-01 0.249 4.945e-01\n", "LOC0309_SRR6371508 0.186 8.700e-03 0.731 7.500e-02\n", "LOC0310_SRR6371511 0.044 1.450e-02 0.025 9.158e-01\n", "LOC0311_SRR6371510 0.034 2.054e-01 0.703 5.760e-02, 5: 0 1 2 3 4\n", "LOC0035_SRR6371515 0.071 0.043 0.854 0.008 0.025\n", "LOC0037_SRR6371514 0.004 0.004 0.005 0.002 0.986\n", "LOC0038_SRR6371513 0.009 0.004 0.006 0.005 0.976\n", "LOC0040_SRR6371512 0.435 0.260 0.259 0.003 0.043\n", "LOC0041_SRR6371519 0.020 0.239 0.466 0.029 0.245\n", "LOC0049_SRR6371518 0.006 0.005 0.009 0.965 0.015\n", "LOC0050_SRR6371517 0.007 0.003 0.013 0.974 0.003\n", "LOC0051_SRR6371516 0.009 0.005 0.018 0.965 0.003\n", "LOC0088_SRR6371521 0.035 0.630 0.043 0.014 0.276\n", "LOC0121_SRR6371520 0.030 0.263 0.674 0.015 0.018\n", "LOC0122_SRR6371503 0.732 0.181 0.019 0.020 0.049\n", "LOC0127_SRR6371502 0.408 0.042 0.131 0.077 0.341\n", "LOC0151_SRR6371505 0.478 0.017 0.478 0.019 0.009\n", "LOC0263_SRR6371507 0.020 0.736 0.012 0.011 0.221\n", "LOC0274_SRR6371506 0.561 0.029 0.336 0.012 0.061\n", "LOC0279_SRR6371509 0.198 0.297 0.013 0.019 0.473\n", "LOC0309_SRR6371508 0.300 0.063 0.377 0.058 0.203\n", "LOC0310_SRR6371511 0.015 0.298 0.598 0.011 0.076\n", "LOC0311_SRR6371510 0.430 0.473 0.016 0.009 0.073, 6: 0 1 2 3 4 5\n", "LOC0035_SRR6371515 0.262 0.005 0.236 0.225 0.015 0.258\n", "LOC0037_SRR6371514 0.007 0.001 0.002 0.005 0.982 0.003\n", "LOC0038_SRR6371513 0.004 0.001 0.002 0.003 0.986 0.003\n", "LOC0040_SRR6371512 0.875 0.017 0.012 0.068 0.023 0.005\n", "LOC0041_SRR6371519 0.271 0.011 0.254 0.183 0.003 0.278\n", "LOC0049_SRR6371518 0.006 0.942 0.013 0.017 0.015 0.006\n", "LOC0050_SRR6371517 0.006 0.951 0.012 0.025 0.004 0.002\n", "LOC0051_SRR6371516 0.010 0.909 0.021 0.042 0.003 0.016\n", "LOC0088_SRR6371521 0.143 0.020 0.137 0.028 0.415 0.257\n", "LOC0121_SRR6371520 0.242 0.017 0.010 0.662 0.058 0.012\n", "LOC0122_SRR6371503 0.126 0.091 0.210 0.363 0.068 0.142\n", "LOC0127_SRR6371502 0.192 0.028 0.078 0.242 0.035 0.426\n", "LOC0151_SRR6371505 0.014 0.127 0.226 0.087 0.015 0.531\n", "LOC0263_SRR6371507 0.646 0.013 0.252 0.041 0.008 0.040\n", "LOC0274_SRR6371506 0.035 0.016 0.231 0.210 0.012 0.495\n", "LOC0279_SRR6371509 0.028 0.013 0.471 0.169 0.025 0.294\n", "LOC0309_SRR6371508 0.174 0.009 0.042 0.020 0.009 0.746\n", "LOC0310_SRR6371511 0.009 0.028 0.085 0.192 0.027 0.659\n", "LOC0311_SRR6371510 0.019 0.008 0.930 0.025 0.009 0.009}\n", " 0 1\n", "LOC0263_SRR6371507 8.000e-04 9.992e-01\n", "LOC0040_SRR6371512 1.000e-03 9.990e-01\n", "LOC0121_SRR6371520 2.000e-03 9.980e-01\n", "LOC0122_SRR6371503 2.700e-03 9.972e-01\n", "LOC0127_SRR6371502 1.512e-01 8.487e-01\n", "LOC0310_SRR6371511 2.500e-01 7.500e-01\n", "LOC0274_SRR6371506 2.507e-01 7.492e-01\n", "LOC0035_SRR6371515 2.510e-01 7.490e-01\n", "LOC0151_SRR6371505 2.515e-01 7.485e-01\n", "LOC0088_SRR6371521 3.583e-01 6.418e-01\n", "LOC0309_SRR6371508 3.872e-01 6.128e-01\n", "LOC0038_SRR6371513 4.972e-01 5.027e-01\n", "LOC0311_SRR6371510 4.992e-01 5.008e-01\n", "LOC0037_SRR6371514 4.995e-01 5.005e-01\n", "LOC0279_SRR6371509 4.995e-01 5.005e-01\n", "LOC0041_SRR6371519 5.835e-01 4.165e-01\n", "LOC0051_SRR6371516 9.988e-01 1.300e-03\n", "LOC0049_SRR6371518 9.995e-01 5.000e-04\n", "LOC0050_SRR6371517 1.000e+00 0.000e+00\n" ] }, { "data": { "text/html": [ "
LOC0263_SRR6371507LOC0040_SRR6371512LOC0121_SRR6371520LOC0122_SRR6371503LOC0127_SRR6371502LOC0310_SRR6371511LOC0274_SRR6371506LOC0035_SRR6371515LOC0151_SRR6371505LOC0088_SRR6371521LOC0309_SRR6371508LOC0038_SRR6371513LOC0311_SRR6371510LOC0037_SRR6371514LOC0279_SRR6371509LOC0041_SRR6371519LOC0051_SRR6371516LOC0049_SRR6371518LOC0050_SRR6371517LOC0263_SRR6371507LOC0040_SRR6371512LOC0121_SRR6371520LOC0122_SRR6371503LOC0127_SRR6371502LOC0310_SRR6371511LOC0274_SRR6371506LOC0035_SRR6371515LOC0151_SRR6371505LOC0088_SRR6371521LOC0309_SRR6371508LOC0038_SRR6371513LOC0311_SRR6371510LOC0037_SRR6371514LOC0279_SRR6371509LOC0041_SRR6371519LOC0051_SRR6371516LOC0049_SRR6371518LOC0050_SRR63715170.00.51.0
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "## set some clumpp params\n", "s.clumppparams.m = 3 ## use largegreedy algorithm\n", "s.clumppparams.greedy_option = 2 ## test nrepeat possible orders\n", "s.clumppparams.repeats = 10000 ## number of repeats\n", "s.clumppparams\n", "## run clumpp for each value of K\n", "#tables = s.get_clumpp_table(kvalues, quiet=True)\n", "print(tables)\n", "#table = tables[4].sort_values(by=[0, 1, 2, 3])\n", "table = tables[2].sort_values(by=[0, 1])\n", "toyplot.bars(\n", " table,\n", " width=500, \n", " height=200,\n", " title=[[i] for i in table.index.tolist()],\n", " xshow=False,\n", ");\n", "print(table)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 2 }