{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Simulated RAD-seq data for ipyrad testing\n", "The simulations software simrrls is available at [github.com/dereneaton/simrrls](github.com/dereneaton/simrrls). \n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "simrrls v.0.0.13\n", "ipyrad v.0.3.42\n" ] } ], "source": [ "import os\n", "import glob\n", "import gzip\n", "import itertools\n", "import numpy as np\n", "import simrrls\n", "import ipyrad as ip\n", "\n", "print \"simrrls v.{}\".format(simrrls.__version__)\n", "print \"ipyrad v.{}\".format(ip.__version__)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Organization" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## the dir to write data to\n", "DIR = \"./ipsimdata/\"\n", "\n", "## if it doesn't exist make it\n", "if not os.path.exists(DIR):\n", " os.makedirs(DIR)\n", " \n", "## if theres anything in it, remove it\n", "for oldfile in glob.glob(os.path.join(DIR, \"*\")):\n", " os.remove(oldfile)\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Simulate the data with simrrls" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\tmin insert size allows read overlaps/adapter sequences \n", "\tmin insert size allows read overlaps/adapter sequences \n" ] } ], "source": [ "%%bash -s $DIR\n", "\n", "## simulate rad_example (includes indels)\n", "simrrls -o $1/rad_example -f rad -dm 10 -ds 2 -I 0.05 -L 1000\n", "\n", "## simulate larger rad data set\n", "#simrrls -o $1/rad_large -f rad -dm 10 -ds 2 -I 0.01 -L 10000\n", "\n", "## simulate pairddrad_example\n", "simrrls -o $1/gbs_example -f gbs -dm 10 -ds 2 -I 0.05 -L 1000\n", "\n", "## simulate pairddrad_example\n", "simrrls -o $1/pairddrad_example -f pairddrad -dm 10 -ds 2 -L 1000\n", "\n", "## simulate pairgbs_example\n", "simrrls -o $1/pairgbs_example -f pairgbs -dm 10 -ds 2 -L 1000\n", "\n", "## simulate pairddrad_wmerge_example \n", "simrrls -o $1/pairddrad_wmerge_example -f pairddrad -dm 10 -ds 2 \\\n", " -i1 -50 -i2 100 -L 1000\n", "\n", "## simulate pairgbs_wmerge_example (mixture of merged and non-merged reads)\n", "simrrls -o $1/pairgbs_wmerge_example -f pairgbs -dm 10 -ds 2 \\\n", " -i1 -50 -i2 100 -L 1000" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Functions to insert reads into simulated genome" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import itertools\n", "import gzip\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## Utility function\n", "def revcomp(sequence):\n", " \"returns reverse complement of a string\"\n", " sequence = sequence[::-1].strip()\\\n", " .replace(\"A\", \"t\")\\\n", " .replace(\"T\", \"a\")\\\n", " .replace(\"C\", \"g\")\\\n", " .replace(\"G\", \"c\").upper()\n", " return sequence" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def RAD_to_genome(R1s, R2s, n_inserts, insert_low, insert_high, out_chr):\n", " \"\"\" \n", " Create a simulated genome file with RAD data interspersed.\n", " \"\"\"\n", " \n", " ## start genome fasta sequence with 5000 random bp\n", " genome = np.random.choice(list(\"ATGC\"), 5000)\n", " \n", " ## read in the RAD data files\n", " dat1 = gzip.open(R1s, 'r')\n", " qiter1 = itertools.izip(*[iter(dat1)]*4)\n", " if R2s:\n", " dat2 = gzip.open(R2s, 'r')\n", " qiter2 = itertools.izip(*[iter(dat2)]*4)\n", " else:\n", " qiter2 = itertools.izip(*[iter(str, 1)]*4)\n", " \n", " ## sample unique reads from rads\n", " uniqs = []\n", " locid = 0\n", " while len(uniqs) < n_inserts:\n", " ## grab a read and get locus id\n", " qrt1 = qiter1.next()\n", " qrt2 = qiter2.next()\n", " iloc = []\n", " ilocid = int(qrt1[0].split(\"_\")[1][5:])\n", " \n", " ## go until end of locus copies\n", " while ilocid == locid:\n", " iloc.append([qrt1[1].strip(), qrt2[1].strip()])\n", " qrt1 = qiter1.next()\n", " qrt2 = qiter2.next()\n", " ilocid = int(qrt1[0].split(\"_\")[1][5:])\n", " \n", " ## sample one read\n", " uniqs.append(iloc[0])\n", " locid += 1\n", " \n", " ## insert RADs into genome\n", " for ins in range(n_inserts): \n", " ## get read, we leave the barcode on cuz it won't hurt\n", " r1 = np.array(list(uniqs[ins][0]))\n", " r2 = np.array(list(revcomp(uniqs[ins][1])))\n", " \n", " ## add R1 to genome\n", " genome = np.concatenate((genome, r1), axis=0)\n", " if qrt2[0]:\n", " ## add insert size\n", " howlong = np.random.uniform(insert_low, insert_high)\n", " chunk = np.random.choice(list(\"ATGC\"), int(howlong))\n", " #print howlong\n", " genome = np.concatenate((genome, chunk), axis=0)\n", " ## add read2\n", " genome = np.concatenate((genome, r2), axis=0)\n", "\n", " ## add spacer between loci\n", " genome = np.concatenate((genome, np.random.choice(list(\"ATGC\"), 5000)), axis=0)\n", "\n", " print(\"input genome is {} bp\".format(genome.shape[0]))\n", " print(\"inserted {} loci into genome\".format(n_inserts))\n", "\n", " with open(out_chr, \"w\") as fasta:\n", " header = \">MT dna:chromosome chromosome:test:MT:1:{}:1 REF\\n\"\\\n", " .format(genome.shape[0])\n", " fasta.write(header)\n", " outseq = list(genome)\n", " for i in range(0, len(outseq), 70):\n", " fasta.write(\"{}\\n\".format(\"\".join(outseq[i:i+70])))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Make 'genome' data files" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## Meta args\n", "N_INSERTS = 500\n", "INSERT_LOW = 250\n", "INSERT_HIGH = 300" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 2555000 bp\n", "inserted 500 loci into genome\n" ] } ], "source": [ "## SE RAD data\n", "DATA_R1 = os.path.join(DIR, \"rad_example_R1_.fastq.gz\")\n", "OUTPUT_CHR = os.path.join(DIR, \"rad_example_genome.fa\")\n", "big = RAD_to_genome(DATA_R1, 0, N_INSERTS, \n", " INSERT_LOW, INSERT_HIGH,\n", " OUTPUT_CHR)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 2555000 bp\n", "inserted 500 loci into genome\n" ] } ], "source": [ "## SE GBS data\n", "DATA_R1 = os.path.join(DIR, \"gbs_example_R1_.fastq.gz\")\n", "OUTPUT_CHR = os.path.join(DIR, \"gbs_example_genome.fa\")\n", "RAD_to_genome(DATA_R1, 0, N_INSERTS,\n", " INSERT_LOW, INSERT_HIGH, \n", " OUTPUT_CHR)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 2742124 bp\n", "inserted 500 loci into genome\n" ] } ], "source": [ "## PAIR ddRAD data \n", "DATA_R1 = os.path.join(DIR, \"pairddrad_example_R1_.fastq.gz\")\n", "DATA_R2 = os.path.join(DIR, \"pairddrad_example_R2_.fastq.gz\")\n", "OUTPUT_CHR = os.path.join(DIR, \"pairddrad_example_genome.fa\")\n", "RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, \n", " INSERT_LOW, INSERT_HIGH,\n", " OUTPUT_CHR)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 2742116 bp\n", "inserted 500 loci into genome\n" ] } ], "source": [ "## PAIR ddRAD data \n", "DATA_R1 = os.path.join(DIR, \"pairddrad_wmerge_example_R1_.fastq.gz\")\n", "DATA_R2 = os.path.join(DIR, \"pairddrad_wmerge_example_R2_.fastq.gz\")\n", "OUTPUT_CHR = os.path.join(DIR, \"pairddrad_wmerge_example_genome.fa\")\n", "RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, \n", " INSERT_LOW, INSERT_HIGH, \n", " OUTPUT_CHR)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 2743007 bp\n", "inserted 500 loci into genome\n" ] } ], "source": [ "## PAIR GBS data\n", "DATA_R1 = os.path.join(DIR, \"pairgbs_wmerge_example_R1_.fastq.gz\")\n", "DATA_R2 = os.path.join(DIR, \"pairgbs_wmerge_example_R2_.fastq.gz\")\n", "OUTPUT_CHR = os.path.join(DIR, \"pairgbs_wmerge_example_genome.fa\")\n", "RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, \n", " INSERT_LOW, INSERT_HIGH, \n", " OUTPUT_CHR)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Run tests in ipyrad\n", "### rad_example" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: denovo\n", "\n", " Assembly: denovo\n", "\n", " [force] overwriting fastq files previously *created by ipyrad* in:\n", " /home/deren/Documents/ipyrad/tests/tests1/denovo_fastqs\n", " This does not affect your *original/raw data files*\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing/compressing | 0:00:00 \n", "\n", " [####################] 100% processing reads | 0:00:04 \n", "\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:44 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:55 \n", "\n", " [####################] 100% consensus calling | 0:00:26 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:01 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:08 \n", " [####################] 100% database indels | 0:00:00 \n", " [####################] 100% indexing clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:01 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:03 \n", " [####################] 100% writing vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo_outfiles\n", "\n", " Assembly: reference\n", "\n", " [force] overwriting fastq files previously *created by ipyrad* in:\n", " /home/deren/Documents/ipyrad/tests/tests1/reference_fastqs\n", " This does not affect your *original/raw data files*\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing/compressing | 0:00:00 \n", "\n", " [####################] 100% processing reads | 0:00:04 \n", "\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:45 \n", " [####################] 100% finalize mapping | 0:00:14 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:31 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:41 \n", "\n", " [####################] 100% consensus calling | 0:00:15 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:04 \n", " [####################] 100% database indels | 0:00:00 \n", " [####################] 100% indexing clusters | 0:00:00 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:01 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:02 \n", " [####################] 100% writing vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests1/reference_outfiles\n", "\n", " Assembly: denovo-plus\n", "\n", " [force] overwriting fastq files previously *created by ipyrad* in:\n", " /home/deren/Documents/ipyrad/tests/tests1/denovo-plus_fastqs\n", " This does not affect your *original/raw data files*\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing/compressing | 0:00:00 \n", "\n", " [####################] 100% processing reads | 0:00:04 \n", "\n", " Force reindexing of reference sequence\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:46 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% finalize mapping | 0:00:15 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:55 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:59 \n", "\n", " [####################] 100% consensus calling | 0:00:29 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:01 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:10 \n", " [####################] 100% database indels | 0:00:00 \n", " [####################] 100% indexing clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:01 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:03 \n", " [####################] 100% writing vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo-plus_outfiles\n", "\n", " Assembly: denovo-minus\n", "\n", " [force] overwriting fastq files previously *created by ipyrad* in:\n", " /home/deren/Documents/ipyrad/tests/tests1/denovo-minus_fastqs\n", " This does not affect your *original/raw data files*\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing/compressing | 0:00:00 \n", "\n", " [####################] 100% processing reads | 0:00:04 \n", "\n", " Force reindexing of reference sequence\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:12 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:26 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:42 \n", "\n", " [####################] 100% consensus calling | 0:00:16 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:05 \n", " [####################] 100% database indels | 0:00:00 \n", " [####################] 100% indexing clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:01 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:02 \n", " [####################] 100% writing vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo-minus_outfiles\n" ] } ], "source": [ "## create an assembly for denovo\n", "data1 = ip.Assembly(\"denovo\")\n", "data1.set_params(1, \"tests1\")\n", "data1.set_params(2, os.path.join(DIR, 'rad_example_R1_.fastq.gz'))\n", "data1.set_params(3, os.path.join(DIR, 'rad_example_barcodes.txt'))\n", "data1.set_params(\"max_alleles_consens\", 4) ## raise this for now \n", "\n", "## branch into an assembly for reference\n", "data2 = data1.branch(\"reference\")\n", "data2.set_params(5, 'reference')\n", "data2.set_params(6, os.path.join(DIR, 'rad_example_genome.fa'))\n", "\n", "## branch into an assembly for denovo+reference\n", "data3 = data2.branch(\"denovo-plus\")\n", "data3.set_params(5, 'denovo+reference')\n", "\n", "## branch into an assembly for reference\n", "data4 = data2.branch(\"denovo-minus\")\n", "data4.set_params(5, 'denovo-reference')\n", "\n", "## assemble both\n", "data1.run(\"1234567\", force=True)\n", "data2.run(\"1234567\", force=True)\n", "data3.run(\"1234567\", force=True)\n", "data4.run(\"1234567\", force=True)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [], "source": [ "NLOCI = 1000\n", "\n", "## check results\n", "assert all(data1.stats.clusters_hidepth == NLOCI)\n", "assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI\n", "\n", "assert all(data2.stats.clusters_hidepth == N_INSERTS)\n", "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS\n", "\n", "assert all(data3.stats.clusters_hidepth == NLOCI)\n", "assert data3.stats_dfs.s7_loci.sum_coverage.max() == NLOCI\n", "\n", "assert all(data4.stats.clusters_hidepth == NLOCI-N_INSERTS)\n", "assert data4.stats_dfs.s7_loci.sum_coverage.max() == NLOCI-N_INSERTS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### gbs example" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: denovo\n", "\n", " Assembly: denovo\n", "\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing/compressing | 0:00:00 \n", "\n", " [####################] 100% processing reads | 0:00:33 \n", "\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:51 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:51 \n", "\n", " [####################] 100% consensus calling | 0:00:24 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:08 \n", " [####################] 100% indexing clusters | 0:00:03 \n", " [####################] 100% building database | 0:00:05 \n", " [####################] 100% filtering loci | 0:00:04 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:14 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo_outfiles\n", "\n", " Assembly: reference\n", "\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing/compressing | 0:00:00 \n", "\n", " [####################] 100% processing reads | 0:00:32 \n", "\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:55 \n", " [####################] 100% finalize mapping | 0:00:20 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:37 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:37 \n", "\n", " [####################] 100% consensus calling | 0:00:14 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:04 \n", " [####################] 100% indexing clusters | 0:00:03 \n", " [####################] 100% building database | 0:00:02 \n", " [####################] 100% filtering loci | 0:00:00 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:07 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests2/reference_outfiles\n", "\n", " Assembly: denovo-plus\n", "\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing/compressing | 0:00:00 \n", "\n", " [####################] 100% processing reads | 0:00:34 \n", "\n", " Force reindexing of reference sequence\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:54 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% finalize mapping | 0:00:21 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:01:06 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:55 \n", "\n", " [####################] 100% consensus calling | 0:00:26 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:10 \n", " [####################] 100% indexing clusters | 0:00:03 \n", " [####################] 100% building database | 0:00:05 \n", " [####################] 100% filtering loci | 0:00:00 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:14 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo-plus_outfiles\n", "\n", " Assembly: denovo-minus\n", "\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing/compressing | 0:00:00 \n", "\n", " [####################] 100% processing reads | 0:00:35 \n", "\n", " Force reindexing of reference sequence\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:11 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:27 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:37 \n", "\n", " [####################] 100% consensus calling | 0:00:15 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:04 \n", " [####################] 100% indexing clusters | 0:00:03 \n", " [####################] 100% building database | 0:00:02 \n", " [####################] 100% filtering loci | 0:00:04 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:07 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo-minus_outfiles\n" ] } ], "source": [ "## create an assembly for denovo\n", "data1 = ip.Assembly(\"denovo\")\n", "data1.set_params(1, \"tests2\")\n", "data1.set_params(2, os.path.join(DIR, \"gbs_example_R1_.fastq.gz\"))\n", "data1.set_params(3, os.path.join(DIR, \"gbs_example_barcodes.txt\"))\n", "data1.set_params(\"max_alleles_consens\", 4) ## raise this for now \n", "\n", "## branch into an assembly for reference\n", "data2 = data1.branch(\"reference\")\n", "data2.set_params(5, \"reference\")\n", "data2.set_params(6, os.path.join(DIR, \"gbs_example_genome.fa\"))\n", "\n", "## branch into an assembly for denovo+reference\n", "data3 = data2.branch(\"denovo-plus\")\n", "data3.set_params(5, 'denovo+reference')\n", "\n", "## branch into an assembly for reference\n", "data4 = data2.branch(\"denovo-minus\")\n", "data4.set_params(5, 'denovo-reference')\n", "\n", "## assemble both\n", "data1.run(\"1234567\", force=True)\n", "data2.run(\"1234567\", force=True)\n", "data3.run(\"1234567\", force=True)\n", "data4.run(\"1234567\", force=True)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "## check results\n", "assert all(data1.stats.clusters_hidepth == 1000)\n", "assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000\n", "\n", "assert all(data2.stats.clusters_hidepth == N_INSERTS)\n", "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS\n", "\n", "assert all(data3.stats.clusters_hidepth == 1000)\n", "assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000\n", "\n", "assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)\n", "assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### pairddrad without merged reads example" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: denovo\n", "\n", " Assembly: denovo\n", "\n", " [force] overwriting fastq files previously *created by ipyrad* in:\n", " /home/deren/Documents/ipyrad/tests/tests3/denovo_fastqs\n", " This does not affect your *original/raw data files*\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:05 \n", " [####################] 100% writing/compressing | 0:00:01 \n", "\n", " [####################] 100% processing reads | 0:00:07 \n", "\n", " [####################] 100% dereplicating | 0:00:01 \n", " [####################] 100% clustering | 0:00:06 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:02:00 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:01:14 \n", "\n", " [####################] 100% consensus calling | 0:00:51 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:02 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:16 \n", " [####################] 100% database indels | 0:00:00 \n", " [####################] 100% indexing clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:01 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:03 \n", " [####################] 100% writing vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo_outfiles\n", "\n", " Assembly: reference\n", "\n", " [force] overwriting fastq files previously *created by ipyrad* in:\n", " /home/deren/Documents/ipyrad/tests/tests3/reference_fastqs\n", " This does not affect your *original/raw data files*\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:05 \n", " [####################] 100% writing/compressing | 0:00:01 \n", "\n", " [####################] 100% processing reads | 0:00:06 \n", "\n", " Force reindexing of reference sequence\n", " [####################] 100% dereplicating | 0:00:01 \n", " [####################] 100% mapping | 0:02:04 \n", " [####################] 100% finalize mapping | 0:00:36 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:01:03 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:00:55 \n", "\n", " [####################] 100% consensus calling | 0:00:29 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:01 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:09 \n", " [####################] 100% database indels | 0:00:00 \n", " [####################] 100% indexing clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:01 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:02 \n", " [####################] 100% writing vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests3/reference_outfiles\n", "\n", " Assembly: denovo-plus\n", "\n", " [force] overwriting fastq files previously *created by ipyrad* in:\n", " /home/deren/Documents/ipyrad/tests/tests3/denovo-plus_fastqs\n", " This does not affect your *original/raw data files*\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:06 \n", " [####################] 100% writing/compressing | 0:00:01 \n", "\n", " [####################] 100% processing reads | 0:00:07 \n", "\n", " Force reindexing of reference sequence\n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:02:14 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% finalize mapping | 0:00:37 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:02:15 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:01:32 \n", "\n", " [####################] 100% consensus calling | 0:00:55 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:02 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:17 \n", " [####################] 100% database indels | 0:00:00 \n", " [####################] 100% indexing clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:01 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:03 \n", " [####################] 100% writing vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo-plus_outfiles\n", "\n", " Assembly: denovo-minus\n", "\n", " [force] overwriting fastq files previously *created by ipyrad* in:\n", " /home/deren/Documents/ipyrad/tests/tests3/denovo-minus_fastqs\n", " This does not affect your *original/raw data files*\n", " [####################] 100% chunking large files | 0:00:00 \n", " [####################] 100% sorting reads | 0:00:05 \n", " [####################] 100% writing/compressing | 0:00:01 \n", "\n", " [####################] 100% processing reads | 0:00:07 \n", "\n", " Force reindexing of reference sequence\n", " [####################] 100% dereplicating | 0:00:01 \n", " [####################] 100% mapping | 0:00:52 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:01:03 \n", " [####################] 100% concatenating | 0:00:00 \n", "\n", " [####################] 100% inferring [H, E] | 0:01:02 \n", "\n", " [####################] 100% consensus calling | 0:00:30 \n", " [####################] 100% concat/shuffle input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:01 \n", " [####################] 100% building clusters | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:09 \n", " [####################] 100% database indels | 0:00:00 \n", " [####################] 100% indexing clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:01 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:02 \n", " [####################] 100% writing vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo-minus_outfiles\n" ] } ], "source": [ "## create an assembly for denovo\n", "data1 = ip.Assembly(\"denovo\")\n", "data1.set_params(1, \"tests3\")\n", "data1.set_params(2, os.path.join(DIR, \"pairddrad_example_*.fastq.gz\"))\n", "data1.set_params(3, os.path.join(DIR, \"pairddrad_example_barcodes.txt\"))\n", "data1.set_params(7, \"pairddrad\")\n", "data1.set_params(8, (\"TGCAG\", \"CCGG\"))\n", "\n", "## branch into an assembly for reference\n", "data2 = data1.branch(\"reference\")\n", "data2.set_params(5, \"reference\")\n", "data2.set_params(6, os.path.join(DIR, \"pairddrad_example_genome.fa\"))\n", "\n", "## branch into an assembly for denovo+reference\n", "data3 = data2.branch(\"denovo-plus\")\n", "data3.set_params(5, 'denovo+reference')\n", "\n", "## branch into an assembly for reference\n", "data4 = data2.branch(\"denovo-minus\")\n", "data4.set_params(5, 'denovo-reference')\n", "\n", "## assemble both\n", "data1.run(\"1234567\", force=True)\n", "data2.run(\"1234567\", force=True)\n", "data3.run(\"1234567\", force=True)\n", "data4.run(\"1234567\", force=True)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "ename": "AssertionError", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclusters_hidepth\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mN_INSERTS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mdata2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats_dfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ms7_loci\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum_coverage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mN_INSERTS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclusters_hidepth\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } ], "source": [ "## check results\n", "assert all(data1.stats.clusters_hidepth == 1000)\n", "assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000\n", "\n", "assert all(data2.stats.clusters_hidepth == N_INSERTS)\n", "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS\n", "\n", "assert all(data3.stats.clusters_hidepth == 1000)\n", "assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000\n", "\n", "assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)\n", "assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "s1 : reads_raw\n", "1A_0 19835\n", "1B_0 20071\n", "1C_0 19969\n", "1D_0 20082\n", "2E_0 20004\n", "2F_0 19899\n", "2G_0 19928\n", "2H_0 20110\n", "3I_0 20078\n", "3J_0 19965\n", "3K_0 19846\n", "3L_0 20025\n", "s2 : reads_raw trim_adapter_bp_read1 trim_adapter_bp_read2 \\\n", "1A_0 19835 0 0 \n", "1B_0 20071 0 0 \n", "1C_0 19969 0 0 \n", "1D_0 20082 0 0 \n", "2E_0 20004 0 0 \n", "2F_0 19899 0 0 \n", "2G_0 19928 0 0 \n", "2H_0 20110 0 0 \n", "3I_0 20078 0 0 \n", "3J_0 19965 0 0 \n", "3K_0 19846 0 0 \n", "3L_0 20025 0 0 \n", "\n", " trim_quality_bp_read1 trim_quality_bp_read2 reads_filtered_by_Ns \\\n", "1A_0 0 0 0 \n", "1B_0 0 0 0 \n", "1C_0 0 0 0 \n", "1D_0 0 0 0 \n", "2E_0 0 0 0 \n", "2F_0 0 0 0 \n", "2G_0 0 0 0 \n", "2H_0 0 0 0 \n", "3I_0 0 0 0 \n", "3J_0 0 0 0 \n", "3K_0 0 0 0 \n", "3L_0 0 0 0 \n", "\n", " reads_filtered_by_minlen reads_passed_filter \n", "1A_0 0 19835 \n", "1B_0 0 20071 \n", "1C_0 0 19969 \n", "1D_0 0 20082 \n", "2E_0 0 20004 \n", "2F_0 0 19899 \n", "2G_0 0 19928 \n", "2H_0 0 20110 \n", "3I_0 0 20078 \n", "3J_0 0 19965 \n", "3K_0 0 19846 \n", "3L_0 0 20025 \n", "s3 : merged_pairs clusters_total clusters_hidepth avg_depth_total \\\n", "1A_0 19835 500 500 19.956 \n", "1B_0 20071 500 500 20.144 \n", "1C_0 19969 500 500 20.108 \n", "1D_0 20082 500 500 19.914 \n", "2E_0 20004 500 500 19.994 \n", "2F_0 19899 500 500 19.786 \n", "2G_0 19928 500 500 19.936 \n", "2H_0 20110 500 500 20.062 \n", "3I_0 20078 500 500 20.022 \n", "3J_0 19965 500 500 19.832 \n", "3K_0 19846 500 500 19.800 \n", "3L_0 20025 500 500 20.046 \n", "\n", " avg_depth_mj avg_depth_stat sd_depth_total sd_depth_mj \\\n", "1A_0 19.956 19.956 2.694079 2.694079 \n", "1B_0 20.144 20.144 2.854341 2.854341 \n", "1C_0 20.108 20.108 2.746695 2.746695 \n", "1D_0 19.914 19.914 2.861923 2.861923 \n", "2E_0 19.994 19.994 2.828067 2.828067 \n", "2F_0 19.786 19.786 2.879619 2.879619 \n", "2G_0 19.936 19.936 2.994646 2.994646 \n", "2H_0 20.062 20.062 2.876483 2.876483 \n", "3I_0 20.022 20.022 2.789537 2.789537 \n", "3J_0 19.832 19.832 2.924342 2.924342 \n", "3K_0 19.800 19.800 2.802856 2.802856 \n", "3L_0 20.046 20.046 2.799979 2.799979 \n", "\n", " sd_depth_stat filtered_bad_align \n", "1A_0 2.694079 0 \n", "1B_0 2.854341 0 \n", "1C_0 2.746695 0 \n", "1D_0 2.861923 0 \n", "2E_0 2.828067 0 \n", "2F_0 2.879619 0 \n", "2G_0 2.994646 0 \n", "2H_0 2.876483 0 \n", "3I_0 2.789537 0 \n", "3J_0 2.924342 0 \n", "3K_0 2.802856 0 \n", "3L_0 2.799979 0 \n", "s4 : hetero_est error_est\n", "1A_0 0.001792 0.000785\n", "1B_0 0.001959 0.000769\n", "1C_0 0.001835 0.000715\n", "1D_0 0.001540 0.000714\n", "2E_0 0.002120 0.000742\n", "2F_0 0.001826 0.000757\n", "2G_0 0.002056 0.000727\n", "2H_0 0.002176 0.000683\n", "3I_0 0.001933 0.000706\n", "3J_0 0.001724 0.000726\n", "3K_0 0.001869 0.000755\n", "3L_0 0.002099 0.000723\n", "s5 : clusters_total filtered_by_depth filtered_by_maxH filtered_by_maxN \\\n", "1A_0 500 0 0 0 \n", "1B_0 500 0 0 0 \n", "1C_0 500 0 0 0 \n", "1D_0 500 0 0 0 \n", "2E_0 500 0 0 0 \n", "2F_0 500 0 0 0 \n", "2G_0 500 0 0 0 \n", "2H_0 500 0 0 0 \n", "3I_0 500 0 0 0 \n", "3J_0 500 0 0 0 \n", "3K_0 500 0 0 0 \n", "3L_0 500 0 0 0 \n", "\n", " reads_consens nsites nhetero heterozygosity \n", "1A_0 500 97500 165 0.001692 \n", "1B_0 500 97500 180 0.001846 \n", "1C_0 500 97500 168 0.001723 \n", "1D_0 500 97500 142 0.001456 \n", "2E_0 500 97500 196 0.002010 \n", "2F_0 500 97500 169 0.001733 \n", "2G_0 500 97500 188 0.001928 \n", "2H_0 500 97500 201 0.002062 \n", "3I_0 500 97499 178 0.001826 \n", "3J_0 500 97500 156 0.001600 \n", "3K_0 500 97500 175 0.001795 \n", "3L_0 500 97500 192 0.001969 \n", "s7_filters : total_filters applied_order retained_loci\n", "total_prefiltered_loci 500 0 500\n", "filtered_by_rm_duplicates 0 0 500\n", "filtered_by_max_indels 0 0 500\n", "filtered_by_max_snps 0 0 500\n", "filtered_by_max_shared_het 0 0 500\n", "filtered_by_min_sample 0 0 500\n", "filtered_by_max_alleles 2 2 498\n", "total_filtered_loci 498 0 498\n", "s7_loci : locus_coverage sum_coverage\n", "1 0 0\n", "2 0 0\n", "3 0 0\n", "4 0 0\n", "5 0 0\n", "6 0 0\n", "7 0 0\n", "8 0 0\n", "9 0 0\n", "10 0 0\n", "11 0 0\n", "12 498 498\n", "s7_samples : sample_coverage\n", "1A_0 498\n", "1B_0 498\n", "1C_0 498\n", "1D_0 498\n", "2E_0 498\n", "2F_0 498\n", "2G_0 498\n", "2H_0 498\n", "3I_0 498\n", "3J_0 498\n", "3K_0 498\n", "3L_0 498\n", "s7_snps : var sum_var pis sum_pis\n", "0 0 0 55 0\n", "1 0 0 132 132\n", "2 1 2 122 376\n", "3 15 47 88 640\n", "4 12 95 55 860\n", "5 32 255 25 985\n", ".. ... ... ... ...\n", "16 8 4322 0 1118\n", "17 1 4339 0 1118\n", "18 1 4357 0 1118\n", "19 0 4357 0 1118\n", "20 1 4377 0 1118\n", "21 1 4398 0 1118\n", "\n", "[22 rows x 4 columns]\n" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data2.stats_dfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### pairgbs w/ merged reads example" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## create an assembly for denovo\n", "data1 = ip.Assembly(\"denovo\")\n", "data1.set_params(1, \"tests4\")\n", "data1.set_params(2, os.path.join(DIR, \"pairgbs_wmerge_example_*.fastq.gz\"))\n", "data1.set_params(3, os.path.join(DIR, \"pairgbs_wmerge_example_barcodes.txt\"))\n", "data1.set_params(7, \"pairgbs\")\n", "data1.set_params(8, (\"TGCAG\", \"TGCAG\"))\n", "\n", "##...\n", "\n", "\n", "## branch into an assembly for denovo+reference\n", "data3 = data1.branch(\"denovo-plus\")\n", "data3.set_params(5, 'denovo+reference')\n", "\n", "## branch into an assembly for reference\n", "data4 = data1.branch(\"denovo-minus\")\n", "data4.set_params(5, 'denovo-reference')\n", "\n", "## assemble both\n", "data1.run(\"1234567\", force=True)\n", "data2.run(\"1234567\", force=True)\n", "data3.run(\"1234567\", force=True)\n", "data4.run(\"1234567\", force=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## check results\n", "assert all(data1.stats.clusters_hidepth == 1000)\n", "assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000\n", "\n", "assert all(data2.stats.clusters_hidepth == N_INSERTS)\n", "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS\n", "\n", "assert all(data3.stats.clusters_hidepth == 1000)\n", "assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000\n", "\n", "assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)\n", "assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create zipped archive of sim data" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "./ipsimdata/\n", "./ipsimdata/pairgbs_example_R2_.fastq.gz\n", "./ipsimdata/pairgbs_wmerge_example_barcodes.txt\n", "./ipsimdata/rad_example_genome.fa\n", "./ipsimdata/pairddrad_example_genome.fa\n", "./ipsimdata/pairgbs_example_R1_.fastq.gz\n", "./ipsimdata/pairgbs_wmerge_example_R2_.fastq.gz\n", "./ipsimdata/rad_example_genome.fa.fai\n", "./ipsimdata/pairddrad_example_R2_.fastq.gz\n", "./ipsimdata/pairddrad_example_genome.fa.sma\n", "./ipsimdata/pairddrad_example_genome.fa.fai\n", "./ipsimdata/pairgbs_wmerge_example_genome.fa\n", "./ipsimdata/pairddrad_wmerge_example_genome.fa\n", "./ipsimdata/pairddrad_example_genome.fa.smi\n", "./ipsimdata/pairgbs_wmerge_example_R1_.fastq.gz\n", "./ipsimdata/rad_example_genome.fa.smi\n", "./ipsimdata/gbs_example_barcodes.txt\n", "./ipsimdata/pairgbs_example_barcodes.txt\n", "./ipsimdata/pairddrad_example_R1_.fastq.gz\n", "./ipsimdata/pairddrad_wmerge_example_barcodes.txt\n", "./ipsimdata/rad_example_barcodes.txt\n", "./ipsimdata/pairddrad_wmerge_example_R1_.fastq.gz\n", "./ipsimdata/pairddrad_wmerge_example_R2_.fastq.gz\n", "./ipsimdata/gbs_example_R1_.fastq.gz\n", "./ipsimdata/pairddrad_example_barcodes.txt\n", "./ipsimdata/rad_example_genome.fa.sma\n", "./ipsimdata/rad_example_R1_.fastq.gz\n", "./ipsimdata/gbs_example_genome.fa\n" ] } ], "source": [ "%%bash -s $DIR\n", "## compressed dir/ w/ all data files\n", "tar -zcvf ipsimdata.tar.gz $1 " ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 0 }