{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simulated RAD-seq data for ipyrad testing\n",
    "The simulations software simrrls is available at [github.com/dereneaton/simrrls](github.com/dereneaton/simrrls). \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "simrrls v.0.0.13\n",
      "ipyrad v.0.3.42\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import glob\n",
    "import gzip\n",
    "import itertools\n",
    "import numpy as np\n",
    "import simrrls\n",
    "import ipyrad as ip\n",
    "\n",
    "print \"simrrls v.{}\".format(simrrls.__version__)\n",
    "print \"ipyrad v.{}\".format(ip.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Organization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## the dir to write data to\n",
    "DIR = \"./ipsimdata/\"\n",
    "\n",
    "## if it doesn't exist make it\n",
    "if not os.path.exists(DIR):\n",
    "    os.makedirs(DIR)\n",
    "    \n",
    "## if theres anything in it, remove it\n",
    "for oldfile in glob.glob(os.path.join(DIR, \"*\")):\n",
    "    os.remove(oldfile)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simulate the data with simrrls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tmin insert size allows read overlaps/adapter sequences \n",
      "\tmin insert size allows read overlaps/adapter sequences \n"
     ]
    }
   ],
   "source": [
    "%%bash -s $DIR\n",
    "\n",
    "## simulate rad_example (includes indels)\n",
    "simrrls -o $1/rad_example -f rad -dm 10 -ds 2 -I 0.05 -L 1000\n",
    "\n",
    "## simulate larger rad data set\n",
    "#simrrls -o $1/rad_large -f rad -dm 10 -ds 2 -I 0.01 -L 10000\n",
    "\n",
    "## simulate pairddrad_example\n",
    "simrrls -o $1/gbs_example -f gbs -dm 10 -ds 2 -I 0.05 -L 1000\n",
    "\n",
    "## simulate pairddrad_example\n",
    "simrrls -o $1/pairddrad_example -f pairddrad -dm 10 -ds 2 -L 1000\n",
    "\n",
    "## simulate pairgbs_example\n",
    "simrrls -o $1/pairgbs_example -f pairgbs -dm 10 -ds 2 -L 1000\n",
    "\n",
    "## simulate pairddrad_wmerge_example \n",
    "simrrls -o $1/pairddrad_wmerge_example -f pairddrad -dm 10 -ds 2 \\\n",
    "                                         -i1 -50 -i2 100 -L 1000\n",
    "\n",
    "## simulate pairgbs_wmerge_example (mixture of merged and non-merged reads)\n",
    "simrrls -o $1/pairgbs_wmerge_example -f pairgbs -dm 10 -ds 2 \\\n",
    "                                       -i1 -50 -i2 100 -L 1000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Functions to insert reads into simulated genome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import itertools\n",
    "import gzip\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Utility function\n",
    "def revcomp(sequence):\n",
    "    \"returns reverse complement of a string\"\n",
    "    sequence = sequence[::-1].strip()\\\n",
    "                             .replace(\"A\", \"t\")\\\n",
    "                             .replace(\"T\", \"a\")\\\n",
    "                             .replace(\"C\", \"g\")\\\n",
    "                             .replace(\"G\", \"c\").upper()\n",
    "    return sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def RAD_to_genome(R1s, R2s, n_inserts, insert_low, insert_high, out_chr):\n",
    "    \"\"\" \n",
    "    Create a simulated genome file with RAD data interspersed.\n",
    "    \"\"\"\n",
    "    \n",
    "    ## start genome fasta sequence with 5000 random bp\n",
    "    genome = np.random.choice(list(\"ATGC\"), 5000)\n",
    "    \n",
    "    ## read in the RAD data files\n",
    "    dat1 = gzip.open(R1s, 'r')\n",
    "    qiter1 = itertools.izip(*[iter(dat1)]*4)\n",
    "    if R2s:\n",
    "        dat2 = gzip.open(R2s, 'r')\n",
    "        qiter2 = itertools.izip(*[iter(dat2)]*4)\n",
    "    else:\n",
    "        qiter2 = itertools.izip(*[iter(str, 1)]*4)\n",
    "       \n",
    "    ## sample unique reads from rads\n",
    "    uniqs = []\n",
    "    locid = 0\n",
    "    while len(uniqs) < n_inserts:\n",
    "        ## grab a read and get locus id\n",
    "        qrt1 = qiter1.next()\n",
    "        qrt2 = qiter2.next()\n",
    "        iloc = []\n",
    "        ilocid = int(qrt1[0].split(\"_\")[1][5:])\n",
    "        \n",
    "        ## go until end of locus copies\n",
    "        while ilocid == locid:\n",
    "            iloc.append([qrt1[1].strip(), qrt2[1].strip()])\n",
    "            qrt1 = qiter1.next()\n",
    "            qrt2 = qiter2.next()\n",
    "            ilocid = int(qrt1[0].split(\"_\")[1][5:])\n",
    "        \n",
    "        ## sample one read\n",
    "        uniqs.append(iloc[0])\n",
    "        locid += 1\n",
    "        \n",
    "    ## insert RADs into genome\n",
    "    for ins in range(n_inserts):   \n",
    "        ## get read, we leave the barcode on cuz it won't hurt\n",
    "        r1 = np.array(list(uniqs[ins][0]))\n",
    "        r2 = np.array(list(revcomp(uniqs[ins][1])))\n",
    "        \n",
    "        ## add R1 to genome\n",
    "        genome = np.concatenate((genome, r1), axis=0)\n",
    "        if qrt2[0]:\n",
    "            ## add insert size\n",
    "            howlong = np.random.uniform(insert_low, insert_high)\n",
    "            chunk = np.random.choice(list(\"ATGC\"), int(howlong))\n",
    "            #print howlong\n",
    "            genome = np.concatenate((genome, chunk), axis=0)\n",
    "            ## add read2\n",
    "            genome = np.concatenate((genome, r2), axis=0)\n",
    "\n",
    "        ## add spacer between loci\n",
    "        genome = np.concatenate((genome, np.random.choice(list(\"ATGC\"), 5000)), axis=0)\n",
    "\n",
    "    print(\"input genome is {} bp\".format(genome.shape[0]))\n",
    "    print(\"inserted {} loci into genome\".format(n_inserts))\n",
    "\n",
    "    with open(out_chr, \"w\") as fasta:\n",
    "        header = \">MT dna:chromosome chromosome:test:MT:1:{}:1 REF\\n\"\\\n",
    "                 .format(genome.shape[0])\n",
    "        fasta.write(header)\n",
    "        outseq = list(genome)\n",
    "        for i in range(0, len(outseq), 70):\n",
    "            fasta.write(\"{}\\n\".format(\"\".join(outseq[i:i+70])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Make 'genome' data files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Meta args\n",
    "N_INSERTS = 500\n",
    "INSERT_LOW = 250\n",
    "INSERT_HIGH = 300"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input genome is 2555000 bp\n",
      "inserted 500 loci into genome\n"
     ]
    }
   ],
   "source": [
    "## SE RAD data\n",
    "DATA_R1 = os.path.join(DIR, \"rad_example_R1_.fastq.gz\")\n",
    "OUTPUT_CHR = os.path.join(DIR, \"rad_example_genome.fa\")\n",
    "big = RAD_to_genome(DATA_R1, 0, N_INSERTS, \n",
    "                    INSERT_LOW, INSERT_HIGH,\n",
    "                    OUTPUT_CHR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input genome is 2555000 bp\n",
      "inserted 500 loci into genome\n"
     ]
    }
   ],
   "source": [
    "## SE GBS data\n",
    "DATA_R1 = os.path.join(DIR, \"gbs_example_R1_.fastq.gz\")\n",
    "OUTPUT_CHR = os.path.join(DIR, \"gbs_example_genome.fa\")\n",
    "RAD_to_genome(DATA_R1, 0, N_INSERTS,\n",
    "              INSERT_LOW, INSERT_HIGH, \n",
    "              OUTPUT_CHR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input genome is 2742124 bp\n",
      "inserted 500 loci into genome\n"
     ]
    }
   ],
   "source": [
    "## PAIR ddRAD data \n",
    "DATA_R1 = os.path.join(DIR, \"pairddrad_example_R1_.fastq.gz\")\n",
    "DATA_R2 = os.path.join(DIR, \"pairddrad_example_R2_.fastq.gz\")\n",
    "OUTPUT_CHR = os.path.join(DIR, \"pairddrad_example_genome.fa\")\n",
    "RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, \n",
    "              INSERT_LOW, INSERT_HIGH,\n",
    "              OUTPUT_CHR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input genome is 2742116 bp\n",
      "inserted 500 loci into genome\n"
     ]
    }
   ],
   "source": [
    "## PAIR ddRAD data \n",
    "DATA_R1 = os.path.join(DIR, \"pairddrad_wmerge_example_R1_.fastq.gz\")\n",
    "DATA_R2 = os.path.join(DIR, \"pairddrad_wmerge_example_R2_.fastq.gz\")\n",
    "OUTPUT_CHR = os.path.join(DIR, \"pairddrad_wmerge_example_genome.fa\")\n",
    "RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, \n",
    "              INSERT_LOW, INSERT_HIGH,  \n",
    "              OUTPUT_CHR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "input genome is 2743007 bp\n",
      "inserted 500 loci into genome\n"
     ]
    }
   ],
   "source": [
    "## PAIR GBS data\n",
    "DATA_R1 = os.path.join(DIR, \"pairgbs_wmerge_example_R1_.fastq.gz\")\n",
    "DATA_R2 = os.path.join(DIR, \"pairgbs_wmerge_example_R2_.fastq.gz\")\n",
    "OUTPUT_CHR = os.path.join(DIR, \"pairgbs_wmerge_example_genome.fa\")\n",
    "RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, \n",
    "              INSERT_LOW, INSERT_HIGH, \n",
    "              OUTPUT_CHR)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Run tests in ipyrad\n",
    "### rad_example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  New Assembly: denovo\n",
      "\n",
      "  Assembly: denovo\n",
      "\n",
      "    [force] overwriting fastq files previously *created by ipyrad* in:\n",
      "    /home/deren/Documents/ipyrad/tests/tests1/denovo_fastqs\n",
      "    This does not affect your *original/raw data files*\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:03 \n",
      "  [####################] 100%  writing/compressing   | 0:00:00 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:04 \n",
      "\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  clustering            | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:00:44 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:55 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:26 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:01 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:08 \n",
      "  [####################] 100%  database indels       | 0:00:00 \n",
      "  [####################] 100%  indexing clusters     | 0:00:01 \n",
      "  [####################] 100%  building database     | 0:00:00 \n",
      "  [####################] 100%  filtering loci        | 0:00:01 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:03 \n",
      "  [####################] 100%  writing vcf file      | 0:00:01 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo_outfiles\n",
      "\n",
      "  Assembly: reference\n",
      "\n",
      "    [force] overwriting fastq files previously *created by ipyrad* in:\n",
      "    /home/deren/Documents/ipyrad/tests/tests1/reference_fastqs\n",
      "    This does not affect your *original/raw data files*\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:03 \n",
      "  [####################] 100%  writing/compressing   | 0:00:00 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:04 \n",
      "\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  mapping               | 0:00:45 \n",
      "  [####################] 100%  finalize mapping      | 0:00:14 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:00:31 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:41 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:15 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:04 \n",
      "  [####################] 100%  database indels       | 0:00:00 \n",
      "  [####################] 100%  indexing clusters     | 0:00:00 \n",
      "  [####################] 100%  building database     | 0:00:00 \n",
      "  [####################] 100%  filtering loci        | 0:00:01 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:02 \n",
      "  [####################] 100%  writing vcf file      | 0:00:01 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests1/reference_outfiles\n",
      "\n",
      "  Assembly: denovo-plus\n",
      "\n",
      "    [force] overwriting fastq files previously *created by ipyrad* in:\n",
      "    /home/deren/Documents/ipyrad/tests/tests1/denovo-plus_fastqs\n",
      "    This does not affect your *original/raw data files*\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:03 \n",
      "  [####################] 100%  writing/compressing   | 0:00:00 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:04 \n",
      "\n",
      "    Force reindexing of reference sequence\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  mapping               | 0:00:46 \n",
      "  [####################] 100%  clustering            | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  finalize mapping      | 0:00:15 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:00:55 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:59 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:29 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:01 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:10 \n",
      "  [####################] 100%  database indels       | 0:00:00 \n",
      "  [####################] 100%  indexing clusters     | 0:00:01 \n",
      "  [####################] 100%  building database     | 0:00:00 \n",
      "  [####################] 100%  filtering loci        | 0:00:01 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:03 \n",
      "  [####################] 100%  writing vcf file      | 0:00:01 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo-plus_outfiles\n",
      "\n",
      "  Assembly: denovo-minus\n",
      "\n",
      "    [force] overwriting fastq files previously *created by ipyrad* in:\n",
      "    /home/deren/Documents/ipyrad/tests/tests1/denovo-minus_fastqs\n",
      "    This does not affect your *original/raw data files*\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:03 \n",
      "  [####################] 100%  writing/compressing   | 0:00:00 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:04 \n",
      "\n",
      "    Force reindexing of reference sequence\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  mapping               | 0:00:12 \n",
      "  [####################] 100%  clustering            | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:00:26 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:42 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:16 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:05 \n",
      "  [####################] 100%  database indels       | 0:00:00 \n",
      "  [####################] 100%  indexing clusters     | 0:00:01 \n",
      "  [####################] 100%  building database     | 0:00:00 \n",
      "  [####################] 100%  filtering loci        | 0:00:01 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:02 \n",
      "  [####################] 100%  writing vcf file      | 0:00:01 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests1/denovo-minus_outfiles\n"
     ]
    }
   ],
   "source": [
    "## create an assembly for denovo\n",
    "data1 = ip.Assembly(\"denovo\")\n",
    "data1.set_params(1, \"tests1\")\n",
    "data1.set_params(2, os.path.join(DIR, 'rad_example_R1_.fastq.gz'))\n",
    "data1.set_params(3, os.path.join(DIR, 'rad_example_barcodes.txt'))\n",
    "data1.set_params(\"max_alleles_consens\", 4) ## raise this for now \n",
    "\n",
    "## branch into an assembly for reference\n",
    "data2 = data1.branch(\"reference\")\n",
    "data2.set_params(5, 'reference')\n",
    "data2.set_params(6, os.path.join(DIR, 'rad_example_genome.fa'))\n",
    "\n",
    "## branch into an assembly for denovo+reference\n",
    "data3 = data2.branch(\"denovo-plus\")\n",
    "data3.set_params(5, 'denovo+reference')\n",
    "\n",
    "## branch into an assembly for reference\n",
    "data4 = data2.branch(\"denovo-minus\")\n",
    "data4.set_params(5, 'denovo-reference')\n",
    "\n",
    "## assemble both\n",
    "data1.run(\"1234567\", force=True)\n",
    "data2.run(\"1234567\", force=True)\n",
    "data3.run(\"1234567\", force=True)\n",
    "data4.run(\"1234567\", force=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "NLOCI = 1000\n",
    "\n",
    "## check results\n",
    "assert all(data1.stats.clusters_hidepth == NLOCI)\n",
    "assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI\n",
    "\n",
    "assert all(data2.stats.clusters_hidepth == N_INSERTS)\n",
    "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS\n",
    "\n",
    "assert all(data3.stats.clusters_hidepth == NLOCI)\n",
    "assert data3.stats_dfs.s7_loci.sum_coverage.max() == NLOCI\n",
    "\n",
    "assert all(data4.stats.clusters_hidepth == NLOCI-N_INSERTS)\n",
    "assert data4.stats_dfs.s7_loci.sum_coverage.max() == NLOCI-N_INSERTS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### gbs example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  New Assembly: denovo\n",
      "\n",
      "  Assembly: denovo\n",
      "\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:03 \n",
      "  [####################] 100%  writing/compressing   | 0:00:00 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:33 \n",
      "\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  clustering            | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:00:51 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:51 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:24 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:08 \n",
      "  [####################] 100%  indexing clusters     | 0:00:03 \n",
      "  [####################] 100%  building database     | 0:00:05 \n",
      "  [####################] 100%  filtering loci        | 0:00:04 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:14 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo_outfiles\n",
      "\n",
      "  Assembly: reference\n",
      "\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:03 \n",
      "  [####################] 100%  writing/compressing   | 0:00:00 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:32 \n",
      "\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  mapping               | 0:00:55 \n",
      "  [####################] 100%  finalize mapping      | 0:00:20 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:00:37 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:37 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:14 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:04 \n",
      "  [####################] 100%  indexing clusters     | 0:00:03 \n",
      "  [####################] 100%  building database     | 0:00:02 \n",
      "  [####################] 100%  filtering loci        | 0:00:00 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:07 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests2/reference_outfiles\n",
      "\n",
      "  Assembly: denovo-plus\n",
      "\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:03 \n",
      "  [####################] 100%  writing/compressing   | 0:00:00 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:34 \n",
      "\n",
      "    Force reindexing of reference sequence\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  mapping               | 0:00:54 \n",
      "  [####################] 100%  clustering            | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  finalize mapping      | 0:00:21 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:01:06 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:55 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:26 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:10 \n",
      "  [####################] 100%  indexing clusters     | 0:00:03 \n",
      "  [####################] 100%  building database     | 0:00:05 \n",
      "  [####################] 100%  filtering loci        | 0:00:00 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:14 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo-plus_outfiles\n",
      "\n",
      "  Assembly: denovo-minus\n",
      "\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:03 \n",
      "  [####################] 100%  writing/compressing   | 0:00:00 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:35 \n",
      "\n",
      "    Force reindexing of reference sequence\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  mapping               | 0:00:11 \n",
      "  [####################] 100%  clustering            | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:00:27 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:37 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:15 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:04 \n",
      "  [####################] 100%  indexing clusters     | 0:00:03 \n",
      "  [####################] 100%  building database     | 0:00:02 \n",
      "  [####################] 100%  filtering loci        | 0:00:04 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:07 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests2/denovo-minus_outfiles\n"
     ]
    }
   ],
   "source": [
    "## create an assembly for denovo\n",
    "data1 = ip.Assembly(\"denovo\")\n",
    "data1.set_params(1, \"tests2\")\n",
    "data1.set_params(2, os.path.join(DIR, \"gbs_example_R1_.fastq.gz\"))\n",
    "data1.set_params(3, os.path.join(DIR, \"gbs_example_barcodes.txt\"))\n",
    "data1.set_params(\"max_alleles_consens\", 4) ## raise this for now \n",
    "\n",
    "## branch into an assembly for reference\n",
    "data2 = data1.branch(\"reference\")\n",
    "data2.set_params(5, \"reference\")\n",
    "data2.set_params(6, os.path.join(DIR, \"gbs_example_genome.fa\"))\n",
    "\n",
    "## branch into an assembly for denovo+reference\n",
    "data3 = data2.branch(\"denovo-plus\")\n",
    "data3.set_params(5, 'denovo+reference')\n",
    "\n",
    "## branch into an assembly for reference\n",
    "data4 = data2.branch(\"denovo-minus\")\n",
    "data4.set_params(5, 'denovo-reference')\n",
    "\n",
    "## assemble both\n",
    "data1.run(\"1234567\", force=True)\n",
    "data2.run(\"1234567\", force=True)\n",
    "data3.run(\"1234567\", force=True)\n",
    "data4.run(\"1234567\", force=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "## check results\n",
    "assert all(data1.stats.clusters_hidepth == 1000)\n",
    "assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000\n",
    "\n",
    "assert all(data2.stats.clusters_hidepth == N_INSERTS)\n",
    "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS\n",
    "\n",
    "assert all(data3.stats.clusters_hidepth == 1000)\n",
    "assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000\n",
    "\n",
    "assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)\n",
    "assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### pairddrad without merged reads example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  New Assembly: denovo\n",
      "\n",
      "  Assembly: denovo\n",
      "\n",
      "    [force] overwriting fastq files previously *created by ipyrad* in:\n",
      "    /home/deren/Documents/ipyrad/tests/tests3/denovo_fastqs\n",
      "    This does not affect your *original/raw data files*\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:05 \n",
      "  [####################] 100%  writing/compressing   | 0:00:01 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:07 \n",
      "\n",
      "  [####################] 100%  dereplicating         | 0:00:01 \n",
      "  [####################] 100%  clustering            | 0:00:06 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:02:00 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:01:14 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:51 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:02 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:16 \n",
      "  [####################] 100%  database indels       | 0:00:00 \n",
      "  [####################] 100%  indexing clusters     | 0:00:01 \n",
      "  [####################] 100%  building database     | 0:00:00 \n",
      "  [####################] 100%  filtering loci        | 0:00:01 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:03 \n",
      "  [####################] 100%  writing vcf file      | 0:00:01 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo_outfiles\n",
      "\n",
      "  Assembly: reference\n",
      "\n",
      "    [force] overwriting fastq files previously *created by ipyrad* in:\n",
      "    /home/deren/Documents/ipyrad/tests/tests3/reference_fastqs\n",
      "    This does not affect your *original/raw data files*\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:05 \n",
      "  [####################] 100%  writing/compressing   | 0:00:01 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:06 \n",
      "\n",
      "    Force reindexing of reference sequence\n",
      "  [####################] 100%  dereplicating         | 0:00:01 \n",
      "  [####################] 100%  mapping               | 0:02:04 \n",
      "  [####################] 100%  finalize mapping      | 0:00:36 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:01:03 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:00:55 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:29 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:01 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:09 \n",
      "  [####################] 100%  database indels       | 0:00:00 \n",
      "  [####################] 100%  indexing clusters     | 0:00:01 \n",
      "  [####################] 100%  building database     | 0:00:00 \n",
      "  [####################] 100%  filtering loci        | 0:00:01 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:02 \n",
      "  [####################] 100%  writing vcf file      | 0:00:01 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests3/reference_outfiles\n",
      "\n",
      "  Assembly: denovo-plus\n",
      "\n",
      "    [force] overwriting fastq files previously *created by ipyrad* in:\n",
      "    /home/deren/Documents/ipyrad/tests/tests3/denovo-plus_fastqs\n",
      "    This does not affect your *original/raw data files*\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:06 \n",
      "  [####################] 100%  writing/compressing   | 0:00:01 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:07 \n",
      "\n",
      "    Force reindexing of reference sequence\n",
      "  [####################] 100%  dereplicating         | 0:00:00 \n",
      "  [####################] 100%  mapping               | 0:02:14 \n",
      "  [####################] 100%  clustering            | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  finalize mapping      | 0:00:37 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:02:15 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:01:32 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:55 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:02 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:17 \n",
      "  [####################] 100%  database indels       | 0:00:00 \n",
      "  [####################] 100%  indexing clusters     | 0:00:01 \n",
      "  [####################] 100%  building database     | 0:00:00 \n",
      "  [####################] 100%  filtering loci        | 0:00:01 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:03 \n",
      "  [####################] 100%  writing vcf file      | 0:00:01 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo-plus_outfiles\n",
      "\n",
      "  Assembly: denovo-minus\n",
      "\n",
      "    [force] overwriting fastq files previously *created by ipyrad* in:\n",
      "    /home/deren/Documents/ipyrad/tests/tests3/denovo-minus_fastqs\n",
      "    This does not affect your *original/raw data files*\n",
      "  [####################] 100%  chunking large files  | 0:00:00 \n",
      "  [####################] 100%  sorting reads         | 0:00:05 \n",
      "  [####################] 100%  writing/compressing   | 0:00:01 \n",
      "\n",
      "  [####################] 100%  processing reads      | 0:00:07 \n",
      "\n",
      "    Force reindexing of reference sequence\n",
      "  [####################] 100%  dereplicating         | 0:00:01 \n",
      "  [####################] 100%  mapping               | 0:00:52 \n",
      "  [####################] 100%  clustering            | 0:00:00 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  chunking              | 0:00:00 \n",
      "  [####################] 100%  aligning              | 0:01:03 \n",
      "  [####################] 100%  concatenating         | 0:00:00 \n",
      "\n",
      "  [####################] 100%  inferring [H, E]      | 0:01:02 \n",
      "\n",
      "  [####################] 100%  consensus calling     | 0:00:30 \n",
      "  [####################] 100%  concat/shuffle input  | 0:00:00 \n",
      "  [####################] 100%  clustering across     | 0:00:01 \n",
      "  [####################] 100%  building clusters     | 0:00:00 \n",
      "  [####################] 100%  aligning clusters     | 0:00:09 \n",
      "  [####################] 100%  database indels       | 0:00:00 \n",
      "  [####################] 100%  indexing clusters     | 0:00:01 \n",
      "  [####################] 100%  building database     | 0:00:00 \n",
      "  [####################] 100%  filtering loci        | 0:00:01 \n",
      "  [####################] 100%  building loci/stats   | 0:00:01 \n",
      "  [####################] 100%  building vcf file     | 0:00:02 \n",
      "  [####################] 100%  writing vcf file      | 0:00:01 \n",
      "  [####################] 100%  writing outfiles      | 0:00:01 \n",
      "  Outfiles written to: ~/Documents/ipyrad/tests/tests3/denovo-minus_outfiles\n"
     ]
    }
   ],
   "source": [
    "## create an assembly for denovo\n",
    "data1 = ip.Assembly(\"denovo\")\n",
    "data1.set_params(1, \"tests3\")\n",
    "data1.set_params(2, os.path.join(DIR, \"pairddrad_example_*.fastq.gz\"))\n",
    "data1.set_params(3, os.path.join(DIR, \"pairddrad_example_barcodes.txt\"))\n",
    "data1.set_params(7, \"pairddrad\")\n",
    "data1.set_params(8, (\"TGCAG\", \"CCGG\"))\n",
    "\n",
    "## branch into an assembly for reference\n",
    "data2 = data1.branch(\"reference\")\n",
    "data2.set_params(5, \"reference\")\n",
    "data2.set_params(6, os.path.join(DIR, \"pairddrad_example_genome.fa\"))\n",
    "\n",
    "## branch into an assembly for denovo+reference\n",
    "data3 = data2.branch(\"denovo-plus\")\n",
    "data3.set_params(5, 'denovo+reference')\n",
    "\n",
    "## branch into an assembly for reference\n",
    "data4 = data2.branch(\"denovo-minus\")\n",
    "data4.set_params(5, 'denovo-reference')\n",
    "\n",
    "## assemble both\n",
    "data1.run(\"1234567\", force=True)\n",
    "data2.run(\"1234567\", force=True)\n",
    "data3.run(\"1234567\", force=True)\n",
    "data4.run(\"1234567\", force=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AssertionError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-54-cce3ab63dc2d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclusters_hidepth\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mN_INSERTS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mdata2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats_dfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ms7_loci\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum_coverage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mN_INSERTS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclusters_hidepth\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAssertionError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "## check results\n",
    "assert all(data1.stats.clusters_hidepth == 1000)\n",
    "assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000\n",
    "\n",
    "assert all(data2.stats.clusters_hidepth == N_INSERTS)\n",
    "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS\n",
    "\n",
    "assert all(data3.stats.clusters_hidepth == 1000)\n",
    "assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000\n",
    "\n",
    "assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)\n",
    "assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "s1 :       reads_raw\n",
       "1A_0      19835\n",
       "1B_0      20071\n",
       "1C_0      19969\n",
       "1D_0      20082\n",
       "2E_0      20004\n",
       "2F_0      19899\n",
       "2G_0      19928\n",
       "2H_0      20110\n",
       "3I_0      20078\n",
       "3J_0      19965\n",
       "3K_0      19846\n",
       "3L_0      20025\n",
       "s2 :       reads_raw  trim_adapter_bp_read1  trim_adapter_bp_read2  \\\n",
       "1A_0      19835                      0                      0   \n",
       "1B_0      20071                      0                      0   \n",
       "1C_0      19969                      0                      0   \n",
       "1D_0      20082                      0                      0   \n",
       "2E_0      20004                      0                      0   \n",
       "2F_0      19899                      0                      0   \n",
       "2G_0      19928                      0                      0   \n",
       "2H_0      20110                      0                      0   \n",
       "3I_0      20078                      0                      0   \n",
       "3J_0      19965                      0                      0   \n",
       "3K_0      19846                      0                      0   \n",
       "3L_0      20025                      0                      0   \n",
       "\n",
       "      trim_quality_bp_read1  trim_quality_bp_read2  reads_filtered_by_Ns  \\\n",
       "1A_0                      0                      0                     0   \n",
       "1B_0                      0                      0                     0   \n",
       "1C_0                      0                      0                     0   \n",
       "1D_0                      0                      0                     0   \n",
       "2E_0                      0                      0                     0   \n",
       "2F_0                      0                      0                     0   \n",
       "2G_0                      0                      0                     0   \n",
       "2H_0                      0                      0                     0   \n",
       "3I_0                      0                      0                     0   \n",
       "3J_0                      0                      0                     0   \n",
       "3K_0                      0                      0                     0   \n",
       "3L_0                      0                      0                     0   \n",
       "\n",
       "      reads_filtered_by_minlen  reads_passed_filter  \n",
       "1A_0                         0                19835  \n",
       "1B_0                         0                20071  \n",
       "1C_0                         0                19969  \n",
       "1D_0                         0                20082  \n",
       "2E_0                         0                20004  \n",
       "2F_0                         0                19899  \n",
       "2G_0                         0                19928  \n",
       "2H_0                         0                20110  \n",
       "3I_0                         0                20078  \n",
       "3J_0                         0                19965  \n",
       "3K_0                         0                19846  \n",
       "3L_0                         0                20025  \n",
       "s3 :       merged_pairs  clusters_total  clusters_hidepth  avg_depth_total  \\\n",
       "1A_0         19835             500               500           19.956   \n",
       "1B_0         20071             500               500           20.144   \n",
       "1C_0         19969             500               500           20.108   \n",
       "1D_0         20082             500               500           19.914   \n",
       "2E_0         20004             500               500           19.994   \n",
       "2F_0         19899             500               500           19.786   \n",
       "2G_0         19928             500               500           19.936   \n",
       "2H_0         20110             500               500           20.062   \n",
       "3I_0         20078             500               500           20.022   \n",
       "3J_0         19965             500               500           19.832   \n",
       "3K_0         19846             500               500           19.800   \n",
       "3L_0         20025             500               500           20.046   \n",
       "\n",
       "      avg_depth_mj  avg_depth_stat  sd_depth_total  sd_depth_mj  \\\n",
       "1A_0        19.956          19.956        2.694079     2.694079   \n",
       "1B_0        20.144          20.144        2.854341     2.854341   \n",
       "1C_0        20.108          20.108        2.746695     2.746695   \n",
       "1D_0        19.914          19.914        2.861923     2.861923   \n",
       "2E_0        19.994          19.994        2.828067     2.828067   \n",
       "2F_0        19.786          19.786        2.879619     2.879619   \n",
       "2G_0        19.936          19.936        2.994646     2.994646   \n",
       "2H_0        20.062          20.062        2.876483     2.876483   \n",
       "3I_0        20.022          20.022        2.789537     2.789537   \n",
       "3J_0        19.832          19.832        2.924342     2.924342   \n",
       "3K_0        19.800          19.800        2.802856     2.802856   \n",
       "3L_0        20.046          20.046        2.799979     2.799979   \n",
       "\n",
       "      sd_depth_stat  filtered_bad_align  \n",
       "1A_0       2.694079                   0  \n",
       "1B_0       2.854341                   0  \n",
       "1C_0       2.746695                   0  \n",
       "1D_0       2.861923                   0  \n",
       "2E_0       2.828067                   0  \n",
       "2F_0       2.879619                   0  \n",
       "2G_0       2.994646                   0  \n",
       "2H_0       2.876483                   0  \n",
       "3I_0       2.789537                   0  \n",
       "3J_0       2.924342                   0  \n",
       "3K_0       2.802856                   0  \n",
       "3L_0       2.799979                   0  \n",
       "s4 :       hetero_est  error_est\n",
       "1A_0    0.001792   0.000785\n",
       "1B_0    0.001959   0.000769\n",
       "1C_0    0.001835   0.000715\n",
       "1D_0    0.001540   0.000714\n",
       "2E_0    0.002120   0.000742\n",
       "2F_0    0.001826   0.000757\n",
       "2G_0    0.002056   0.000727\n",
       "2H_0    0.002176   0.000683\n",
       "3I_0    0.001933   0.000706\n",
       "3J_0    0.001724   0.000726\n",
       "3K_0    0.001869   0.000755\n",
       "3L_0    0.002099   0.000723\n",
       "s5 :       clusters_total  filtered_by_depth  filtered_by_maxH  filtered_by_maxN  \\\n",
       "1A_0             500                  0                 0                 0   \n",
       "1B_0             500                  0                 0                 0   \n",
       "1C_0             500                  0                 0                 0   \n",
       "1D_0             500                  0                 0                 0   \n",
       "2E_0             500                  0                 0                 0   \n",
       "2F_0             500                  0                 0                 0   \n",
       "2G_0             500                  0                 0                 0   \n",
       "2H_0             500                  0                 0                 0   \n",
       "3I_0             500                  0                 0                 0   \n",
       "3J_0             500                  0                 0                 0   \n",
       "3K_0             500                  0                 0                 0   \n",
       "3L_0             500                  0                 0                 0   \n",
       "\n",
       "      reads_consens  nsites  nhetero  heterozygosity  \n",
       "1A_0            500   97500      165        0.001692  \n",
       "1B_0            500   97500      180        0.001846  \n",
       "1C_0            500   97500      168        0.001723  \n",
       "1D_0            500   97500      142        0.001456  \n",
       "2E_0            500   97500      196        0.002010  \n",
       "2F_0            500   97500      169        0.001733  \n",
       "2G_0            500   97500      188        0.001928  \n",
       "2H_0            500   97500      201        0.002062  \n",
       "3I_0            500   97499      178        0.001826  \n",
       "3J_0            500   97500      156        0.001600  \n",
       "3K_0            500   97500      175        0.001795  \n",
       "3L_0            500   97500      192        0.001969  \n",
       "s7_filters :                             total_filters  applied_order  retained_loci\n",
       "total_prefiltered_loci                500              0            500\n",
       "filtered_by_rm_duplicates               0              0            500\n",
       "filtered_by_max_indels                  0              0            500\n",
       "filtered_by_max_snps                    0              0            500\n",
       "filtered_by_max_shared_het              0              0            500\n",
       "filtered_by_min_sample                  0              0            500\n",
       "filtered_by_max_alleles                 2              2            498\n",
       "total_filtered_loci                   498              0            498\n",
       "s7_loci :     locus_coverage  sum_coverage\n",
       "1                0             0\n",
       "2                0             0\n",
       "3                0             0\n",
       "4                0             0\n",
       "5                0             0\n",
       "6                0             0\n",
       "7                0             0\n",
       "8                0             0\n",
       "9                0             0\n",
       "10               0             0\n",
       "11               0             0\n",
       "12             498           498\n",
       "s7_samples :       sample_coverage\n",
       "1A_0              498\n",
       "1B_0              498\n",
       "1C_0              498\n",
       "1D_0              498\n",
       "2E_0              498\n",
       "2F_0              498\n",
       "2G_0              498\n",
       "2H_0              498\n",
       "3I_0              498\n",
       "3J_0              498\n",
       "3K_0              498\n",
       "3L_0              498\n",
       "s7_snps :     var  sum_var  pis  sum_pis\n",
       "0     0        0   55        0\n",
       "1     0        0  132      132\n",
       "2     1        2  122      376\n",
       "3    15       47   88      640\n",
       "4    12       95   55      860\n",
       "5    32      255   25      985\n",
       "..  ...      ...  ...      ...\n",
       "16    8     4322    0     1118\n",
       "17    1     4339    0     1118\n",
       "18    1     4357    0     1118\n",
       "19    0     4357    0     1118\n",
       "20    1     4377    0     1118\n",
       "21    1     4398    0     1118\n",
       "\n",
       "[22 rows x 4 columns]\n"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data2.stats_dfs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### pairgbs w/ merged reads example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## create an assembly for denovo\n",
    "data1 = ip.Assembly(\"denovo\")\n",
    "data1.set_params(1, \"tests4\")\n",
    "data1.set_params(2, os.path.join(DIR, \"pairgbs_wmerge_example_*.fastq.gz\"))\n",
    "data1.set_params(3, os.path.join(DIR, \"pairgbs_wmerge_example_barcodes.txt\"))\n",
    "data1.set_params(7, \"pairgbs\")\n",
    "data1.set_params(8, (\"TGCAG\", \"TGCAG\"))\n",
    "\n",
    "##...\n",
    "\n",
    "\n",
    "## branch into an assembly for denovo+reference\n",
    "data3 = data1.branch(\"denovo-plus\")\n",
    "data3.set_params(5, 'denovo+reference')\n",
    "\n",
    "## branch into an assembly for reference\n",
    "data4 = data1.branch(\"denovo-minus\")\n",
    "data4.set_params(5, 'denovo-reference')\n",
    "\n",
    "## assemble both\n",
    "data1.run(\"1234567\", force=True)\n",
    "data2.run(\"1234567\", force=True)\n",
    "data3.run(\"1234567\", force=True)\n",
    "data4.run(\"1234567\", force=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## check results\n",
    "assert all(data1.stats.clusters_hidepth == 1000)\n",
    "assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000\n",
    "\n",
    "assert all(data2.stats.clusters_hidepth == N_INSERTS)\n",
    "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS\n",
    "\n",
    "assert all(data3.stats.clusters_hidepth == 1000)\n",
    "assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000\n",
    "\n",
    "assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)\n",
    "assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create zipped archive of sim data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./ipsimdata/\n",
      "./ipsimdata/pairgbs_example_R2_.fastq.gz\n",
      "./ipsimdata/pairgbs_wmerge_example_barcodes.txt\n",
      "./ipsimdata/rad_example_genome.fa\n",
      "./ipsimdata/pairddrad_example_genome.fa\n",
      "./ipsimdata/pairgbs_example_R1_.fastq.gz\n",
      "./ipsimdata/pairgbs_wmerge_example_R2_.fastq.gz\n",
      "./ipsimdata/rad_example_genome.fa.fai\n",
      "./ipsimdata/pairddrad_example_R2_.fastq.gz\n",
      "./ipsimdata/pairddrad_example_genome.fa.sma\n",
      "./ipsimdata/pairddrad_example_genome.fa.fai\n",
      "./ipsimdata/pairgbs_wmerge_example_genome.fa\n",
      "./ipsimdata/pairddrad_wmerge_example_genome.fa\n",
      "./ipsimdata/pairddrad_example_genome.fa.smi\n",
      "./ipsimdata/pairgbs_wmerge_example_R1_.fastq.gz\n",
      "./ipsimdata/rad_example_genome.fa.smi\n",
      "./ipsimdata/gbs_example_barcodes.txt\n",
      "./ipsimdata/pairgbs_example_barcodes.txt\n",
      "./ipsimdata/pairddrad_example_R1_.fastq.gz\n",
      "./ipsimdata/pairddrad_wmerge_example_barcodes.txt\n",
      "./ipsimdata/rad_example_barcodes.txt\n",
      "./ipsimdata/pairddrad_wmerge_example_R1_.fastq.gz\n",
      "./ipsimdata/pairddrad_wmerge_example_R2_.fastq.gz\n",
      "./ipsimdata/gbs_example_R1_.fastq.gz\n",
      "./ipsimdata/pairddrad_example_barcodes.txt\n",
      "./ipsimdata/rad_example_genome.fa.sma\n",
      "./ipsimdata/rad_example_R1_.fastq.gz\n",
      "./ipsimdata/gbs_example_genome.fa\n"
     ]
    }
   ],
   "source": [
    "%%bash -s $DIR\n",
    "## compressed dir/ w/ all data files\n",
    "tar -zcvf ipsimdata.tar.gz $1 "
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}