{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Simulate RAD-seq data\n", "The simulation software simrrls is available at [github.com/dereneaton/simrrls](https://github.com/dereneaton/simrrls). First we create a directory called `ipsimdata/` and then simulate data and put it in this directory. Simrrls has a few dependencies that must be installed first. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Global variables" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## name for our sim data directory\n", "DIR = \"./ipsimdata\"\n", "\n", "## A mouse MT genome to insert our simulated data into.\n", "INPUT_CHR = \"/home/deren/Downloads/MusMT.fa\"\n", "\n", "## number of RAD loci to simulate\n", "NLOCI = 1000\n", "\n", "## number of inserts into the reference genome, and the insert size\n", "N_INSERTS = 100\n", "INSERT_SIZE = 50" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set up / clean up directories" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "import os\n", "import shutil\n", "\n", "## rm the sim dir if it exists, then recreate it fresh\n", "if os.path.exists(DIR):\n", " shutil.rmtree(DIR)\n", "os.mkdir(DIR)\n", "\n", "## rm testdirs if they exist\n", "TESTDIRS = [\"./testref1\", \"./testref2\", \"./testref3\", \"./testref4\"]\n", "for testdir in TESTDIRS:\n", " if os.path.exists(testdir):\n", " shutil.rmtree(testdir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Simulate the RAD data" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "simrrls 0.0.11\n" ] } ], "source": [ "import simrrls\n", "print 'simrrls', simrrls.__version__" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "simrrls -o ./ipsimdata/rad_example -f rad -dm 10 -ds 2 \n", " -I 0.01 -L 1000 -i1 50 -i2 100 \n", "\n", "simrrls -o ./ipsimdata/gbs_example -f gbs -dm 10 -ds 2 \n", " -I 0.01 -L 1000 -i1 50 -i2 100 \n", "\n", "simrrls -o ./ipsimdata/pairddrad_example -f pairddrad -dm 10 -ds 2 \n", " -I 0.01 -L 1000 -i1 50 -i2 100 \n", "\n", "simrrls -o ./ipsimdata/pairgbs_example -f pairgbs -dm 10 -ds 2 \n", " -I 0.01 -L 1000 -i1 50 -i2 100 \n", "\n", "simrrls -o ./ipsimdata/pairddrad_wmerge_example -f pairddrad -dm 10 -ds 2 \n", " -I 0.01 -L 1000 -i1 -50 -i2 50 \n", "\n", "simrrls -o ./ipsimdata/pairgbs_wmerge_example -f pairgbs -dm 10 -ds 2 \n", " -I 0.01 -L 1000 -i1 -50 -i2 50 \n", "\n" ] }, { "data": { "text/plain": [ "'\\tmin insert size allows read overlaps/adapter sequences \\n'" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import subprocess\n", "\n", "## this is the bash command-line call to simrrls\n", "cmd = \"\"\"\\\n", "simrrls -o {odir}/{oname} -f {form} -dm 10 -ds 2 \n", " -I 0.01 -L {nloci} -i1 {imin} -i2 {imax} \n", "\"\"\"\n", " \n", "## simulate rad_example (includes indels)\n", "call = cmd.format(odir=DIR, oname=\"rad_example\", form=\"rad\", \n", " imin=50, imax=100, nloci=NLOCI)\n", "print call\n", "subprocess.check_output(call.split())\n", "\n", "## simulate gbs_example (includes indels)\n", "call = cmd.format(odir=DIR, 
oname=\"gbs_example\", form=\"gbs\", \n", " imin=50, imax=100, nloci=NLOCI)\n", "print call\n", "subprocess.check_output(call.split())\n", "\n", "## simulate pairddrad_example (includes indels)\n", "call = cmd.format(odir=DIR, oname=\"pairddrad_example\", form=\"pairddrad\", \n", " imin=50, imax=100, nloci=NLOCI)\n", "print call\n", "subprocess.check_output(call.split())\n", "\n", "## simulate pairgbs_example (includes indels)\n", "call = cmd.format(odir=DIR, oname=\"pairgbs_example\", form=\"pairgbs\",\n", " imin=50, imax=100, nloci=NLOCI)\n", "print call\n", "subprocess.check_output(call.split())\n", "\n", "## simulate pairddrad_wmerge_example (includes indels and merged reads)\n", "call = cmd.format(odir=DIR, oname=\"pairddrad_wmerge_example\", form=\"pairddrad\", \n", " imin=-50, imax=50, nloci=NLOCI)\n", "print call\n", "subprocess.check_output(call.split())\n", "\n", "## simulate pairgbs_wmerge_example (includes indels and merged reads)\n", "call = cmd.format(odir=DIR, oname=\"pairgbs_wmerge_example\", form=\"pairgbs\", \n", " imin=-50, imax=50, nloci=NLOCI)\n", "print call\n", "subprocess.check_output(call.split())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Manufacture a pseudo-genome with hits from sim data\n", "\n", "These functions take simulated RAD data and a \"large\" input genome (really it could just be a randomly generated fasta) and randomly insert a handful of simulated RAD tags into the genome. This guarantees that reference mapping will actually map something. For PE simulated data the R2 reads are reverse-complemented before they're inserted, because smalt is run with the `-l pe` flag, which expects reads in this orientation __`--> <--`__. Also, for PE data the inner mate distance is fixed at 50 bp. If you want to get ambitious you could draw this value from a distribution, but that seems like more effort than it's worth. This is meant to be run from __`ipsimdata/`__, but you can run it from anywhere if you update the paths.\n" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import itertools\n", "import gzip\n", "import random\n", "from Bio import SeqIO" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Function to insert simulated reads into the genome" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## Utility function\n", "def revcomp(sequence):\n", " \"returns reverse complement of a string, e.g. revcomp('ATTGCC') -> 'GGCAAT'\"\n", " sequence = sequence[::-1].strip()\\\n", " .replace(\"A\", \"t\")\\\n", " .replace(\"T\", \"a\")\\\n", " .replace(\"C\", \"g\")\\\n", " .replace(\"G\", \"c\").upper()\n", " return sequence" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def RAD_to_genome(R1s, R2s, n_inserts, insert_sz, input_chr, out_chr):\n", " \"\"\" \n", " Writes simulated RAD data into a genome fasta file. 
\n", " Assumes the RAD data files have read names formatted like the \n", " output from simrrls.\n", " \"\"\"\n", " ## read in the full genome file\n", " record = SeqIO.read(input_chr, \"fasta\")\n", " lenchr = len(record.seq)\n", "\n", " ## read in the RAD data files as 4-line fastq chunks\n", " dat1 = gzip.open(R1s, 'r')\n", " qiter1 = itertools.izip(*[iter(dat1)]*4)\n", " if R2s:\n", " dat2 = gzip.open(R2s, 'r')\n", " qiter2 = itertools.izip(*[iter(dat2)]*4)\n", " else:\n", " ## no R2 file: use a dummy iterator that always yields empty strings\n", " qiter2 = itertools.izip(*[iter(str, 1)]*4)\n", " \n", " ## sample unique reads from rads\n", " uniqs = []\n", " locid = 0\n", " while len(uniqs) < n_inserts:\n", " ## grab a read and get locus id\n", " qrt1 = qiter1.next()\n", " qrt2 = qiter2.next()\n", " iloc = []\n", " ilocid = int(qrt1[0].split(\"_\")[1][5:])\n", " ## go until end of locus copies\n", " while ilocid == locid:\n", " iloc.append([qrt1[1].strip(), qrt2[1].strip()])\n", " qrt1 = qiter1.next()\n", " qrt2 = qiter2.next()\n", " ilocid = int(qrt1[0].split(\"_\")[1][5:])\n", " ## sample one read\n", " uniqs.append(random.sample(iloc, 1)[0])\n", " locid += 1\n", " \n", " ## insert RADs into genome\n", " sloc = 100\n", " for ins in range(n_inserts): \n", " ## get read; we leave the barcode on because it won't hurt\n", " r1 = uniqs[ins][0]\n", " r2 = uniqs[ins][1]\n", " if not r2:\n", " record.seq = record.seq[:sloc]+r1+\\\n", " record.seq[sloc:]\n", " else: \n", " record.seq = record.seq[:sloc]+r1+\\\n", " record.seq[sloc:sloc+insert_sz]+\\\n", " revcomp(r2)+\\\n", " record.seq[sloc+insert_sz:] \n", " sloc += 300\n", " \n", " ## write to file\n", " rlen = len(qrt1[1].strip())\n", " if r2: \n", " rlen *= 2\n", " print(\"input genome is {} bp\".format(lenchr))\n", " print('imputed {} loci {} bp in len'.format(n_inserts, rlen))\n", " print(\"new pseudo-genome is {} bp\".format(len(record.seq)))\n", " output_handle = open(out_chr, \"w\")\n", " SeqIO.write(record, output_handle, \"fasta\")\n", " output_handle.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Make pseudo-ref data files" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 16299 bp\n", "imputed 100 loci 100 bp in len\n", "new pseudo-genome is 26299 bp\n" ] } ], "source": [ "## SE RAD data\n", "DATA_R1 = DIR+\"/rad_example_R1_.fastq.gz\"\n", "OUTPUT_CHR = DIR+\"/rad_example_genome.fa\"\n", "RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 16299 bp\n", "imputed 100 loci 100 bp in len\n", "new pseudo-genome is 26299 bp\n" ] } ], "source": [ "## SE GBS data\n", "DATA_R1 = DIR+\"/gbs_example_R1_.fastq.gz\"\n", "OUTPUT_CHR = DIR+\"/gbs_example_genome.fa\"\n", "RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 16299 bp\n", "imputed 100 loci 200 bp in len\n", "new pseudo-genome is 36299 bp\n" ] } ], "source": [ "## PAIR ddRAD data \n", "DATA_R1 = DIR+\"/pairddrad_wmerge_example_R1_.fastq.gz\"\n", "DATA_R2 = DIR+\"/pairddrad_wmerge_example_R2_.fastq.gz\"\n", "OUTPUT_CHR = DIR+\"/pairddrad_wmerge_example_genome.fa\"\n", "RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)" ] }, { "cell_type": 
"code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "input genome is 16299 bp\n", "imputed 100 loci 200 bp in len\n", "new pseudo-genome is 36299 bp\n" ] } ], "source": [ "## PAIR GBS data\n", "DATA_R1 = DIR+\"/pairgbs_wmerge_example_R1_.fastq.gz\"\n", "DATA_R2 = DIR+\"/pairgbs_wmerge_example_R2_.fastq.gz\"\n", "OUTPUT_CHR = DIR+\"/pairgbs_wmerge_example_genome.fa\"\n", "RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Run tests\n", "### rad_example" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: denovo\n", "\n", " Assembly: denovo\n", " [####################] 100% sorting reads | 0:00:04 \n", " [####################] 100% writing files | 0:00:00 \n", " [####################] 100% processing reads | 0:00:40 \n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% clustering | 0:00:01 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:50 \n", " [####################] 100% concatenating | 0:00:00 \n", " [####################] 100% inferring [H, E] | 0:01:12 \n", " [####################] 100% consensus calling | 0:00:38 \n", " [####################] 100% concat/shuf input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:09 \n", " [####################] 100% ordering clusters | 0:00:17 \n", " [####################] 100% building database | 0:00:09 \n", " [####################] 100% filtering loci | 0:00:02 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:11 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/testref1/denovo_outfiles\n", "\n", " Assembly: reference\n", " [####################] 100% sorting reads | 0:00:04 \n", " [####################] 100% writing files | 0:00:00 \n", " [####################] 100% processing reads | 0:00:52 \n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:01 \n", " [####################] 100% finalize mapping | 0:00:11 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:06 \n", " [####################] 100% concatenating | 0:00:00 \n", " [####################] 100% inferring [H, E] | 0:00:25 \n", " [####################] 100% consensus calling | 0:00:04 \n", " [####################] 100% concat/shuf input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:02 \n", " [####################] 100% ordering clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:00 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/testref1/reference_outfiles\n" ] } ], "source": [ "import ipyrad as ip\n", "\n", "## create an assembly for denovo\n", "data1 = ip.Assembly(\"denovo\")\n", 
"data1.set_params(1, \"testref1\")\n", "data1.set_params(2, DIR+'/rad_example_R1_.fastq.gz')\n", "data1.set_params(3, DIR+'/rad_example_barcodes.txt')\n", "\n", "## branch into an assembly for reference\n", "data2 = data1.branch(\"reference\")\n", "data2.set_params(5, 'reference')\n", "data2.set_params(6, DIR+'/rad_example_genome.fa')\n", "\n", "## assemble both\n", "data1.run(force=True)\n", "data2.run(force=True)\n", "\n", "## check results\n", "assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI\n", "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### gbs example" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: denovo\n", "\n", " Assembly: denovo\n", " [####################] 100% sorting reads | 0:00:04 \n", " [####################] 100% writing files | 0:00:00 \n", " [####################] 100% processing reads | 0:00:52 \n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:01:19 \n", " [####################] 100% concatenating | 0:00:00 \n", " [####################] 100% inferring [H, E] | 0:01:39 \n", " [####################] 100% consensus calling | 0:00:47 \n", " [####################] 100% concat/shuf input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:01 \n", " [####################] 100% aligning clusters | 0:00:10 \n", " [####################] 100% ordering clusters | 0:00:17 \n", " [####################] 100% building database | 0:00:08 \n", " [####################] 100% filtering loci | 0:00:03 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:12 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/testref2/denovo_outfiles\n", "\n", " Assembly: reference\n", " [####################] 100% sorting reads | 0:00:04 \n", " [####################] 100% writing files | 0:00:00 \n", " [####################] 100% processing reads | 0:00:51 \n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:01 \n", " [####################] 100% finalize mapping | 0:00:11 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:05 \n", " [####################] 100% concatenating | 0:00:00 \n", " [####################] 100% inferring [H, E] | 0:00:24 \n", " [####################] 100% consensus calling | 0:00:05 \n", " [####################] 100% concat/shuf input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:02 \n", " [####################] 100% ordering clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:00 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/testref2/reference_outfiles\n" ] } ], "source": [ "import ipyrad as ip\n", "\n", "## create an assembly for denovo\n", "data1 = 
ip.Assembly(\"denovo\")\n", "data1.set_params(1, \"testref2\")\n", "data1.set_params(2, DIR+'/gbs_example_R1_.fastq.gz')\n", "data1.set_params(3, DIR+'/gbs_example_barcodes.txt')\n", "\n", "## branch into an assembly for reference\n", "data2 = data1.branch(\"reference\")\n", "data2.set_params(5, 'reference')\n", "data2.set_params(6, DIR+'/gbs_example_genome.fa')\n", "\n", "## assemble both\n", "data1.run(force=True)\n", "data2.run(force=True)\n", "\n", "## check results\n", "assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI\n", "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### pairddrad example" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: denovo\n", "\n", " Assembly: denovo\n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing files | 0:00:00 \n", " [####################] 100% processing reads | 0:00:51 \n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:01:07 \n", " [####################] 100% concatenating | 0:00:00 \n", " [####################] 100% inferring [H, E] | 0:01:33 \n", " [####################] 100% consensus calling | 0:00:42 \n", " [####################] 100% concat/shuf input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:01 \n", " [####################] 100% aligning clusters | 0:00:11 \n", " [####################] 100% ordering clusters | 0:00:22 \n", " [####################] 100% building database | 0:00:06 \n", " [####################] 100% filtering loci | 0:00:04 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:14 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/testref3/denovo_outfiles\n", "\n", " Assembly: reference\n", " [####################] 100% sorting reads | 0:00:05 \n", " [####################] 100% writing files | 0:00:00 \n", " [####################] 100% processing reads | 0:01:06 \n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:01 \n", " [####################] 100% finalize mapping | 0:00:14 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:07 \n", " [####################] 100% concatenating | 0:00:00 \n", " [####################] 100% inferring [H, E] | 0:00:27 \n", " [####################] 100% consensus calling | 0:00:08 \n", " [####################] 100% concat/shuf input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:02 \n", " [####################] 100% ordering clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:00 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/testref3/reference_outfiles\n" ] } ], "source": [ "import ipyrad as ip\n", "\n", "## create an assembly for denovo\n", "data1 = 
ip.Assembly(\"denovo\")\n", "data1.set_params(1, \"testref3\")\n", "data1.set_params(2, DIR+'/pairddrad_wmerge_example_R1_.fastq.gz')\n", "data1.set_params(3, DIR+'/pairddrad_wmerge_example_barcodes.txt')\n", "\n", "## branch into an assembly for reference\n", "data2 = data1.branch(\"reference\")\n", "data2.set_params(5, 'reference')\n", "data2.set_params(6, DIR+'/pairddrad_wmerge_example_genome.fa')\n", "\n", "## assemble both\n", "data1.run(force=True)\n", "data2.run(force=True)\n", "\n", "## check results\n", "assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI\n", "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: denovo\n", "\n", " Assembly: denovo\n", " [####################] 100% sorting reads | 0:00:04 \n", " [####################] 100% writing files | 0:00:00 \n", " [####################] 100% processing reads | 0:00:40 \n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% clustering | 0:00:00 \n", " [####################] 100% chunking | 0:00:01 \n", " [####################] 100% aligning | 0:00:45 \n", " [####################] 100% concatenating | 0:00:00 \n", " [####################] 100% inferring [H, E] | 0:00:58 \n", " [####################] 100% consensus calling | 0:00:29 \n", " [####################] 100% concat/shuf input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:01 \n", " [####################] 100% aligning clusters | 0:00:07 \n", " [####################] 100% ordering clusters | 0:00:14 \n", " [####################] 100% building database | 0:00:06 \n", " [####################] 100% filtering loci | 0:00:03 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:09 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/testref4/denovo_outfiles\n", "\n", " Assembly: reference\n", " [####################] 100% sorting reads | 0:00:03 \n", " [####################] 100% writing files | 0:00:00 \n", " [####################] 100% processing reads | 0:00:41 \n", " [####################] 100% dereplicating | 0:00:00 \n", " [####################] 100% mapping | 0:00:01 \n", " [####################] 100% finalize mapping | 0:00:10 \n", " [####################] 100% chunking | 0:00:00 \n", " [####################] 100% aligning | 0:00:06 \n", " [####################] 100% concatenating | 0:00:00 \n", " [####################] 100% inferring [H, E] | 0:00:19 \n", " [####################] 100% consensus calling | 0:00:05 \n", " [####################] 100% concat/shuf input | 0:00:00 \n", " [####################] 100% clustering across | 0:00:00 \n", " [####################] 100% aligning clusters | 0:00:02 \n", " [####################] 100% ordering clusters | 0:00:01 \n", " [####################] 100% building database | 0:00:00 \n", " [####################] 100% filtering loci | 0:00:00 \n", " [####################] 100% building loci/stats | 0:00:01 \n", " [####################] 100% building vcf file | 0:00:01 \n", " [####################] 100% writing outfiles | 0:00:01 \n", " Outfiles written to: ~/Documents/ipyrad/tests/testref4/reference_outfiles\n" ] } ], "source": [ "import ipyrad as ip\n", "\n", "## create an assembly for denovo\n", "data1 = ip.Assembly(\"denovo\")\n", "data1.set_params(1, 
\"testref4\")\n", "data1.set_params(2, DIR+'/pairgbs_wmerge_example_R1_.fastq.gz')\n", "data1.set_params(3, DIR+'/pairgbs_wmerge_example_barcodes.txt')\n", "\n", "## branch into an assembly for reference\n", "data2 = data1.branch(\"reference\")\n", "data2.set_params(5, 'reference')\n", "data2.set_params(6, DIR+'/pairgbs_wmerge_example_genome.fa')\n", "\n", "## assemble both\n", "data1.run(force=True)\n", "data2.run(force=True)\n", "\n", "## check results\n", "assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI\n", "assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Clean up test dirs" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import glob\n", "import os\n", "import shutil\n", "\n", "## rm the test dirs if they exist\n", "for tdir in glob.glob(\"testref[1-9]\"):\n", " shutil.rmtree(tdir)\n", " \n", "## rm reference index\n", "for iref in glob.glob(DIR+\"/*_genome.fa.*\"):\n", " os.remove(iref)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create zipped simdata archive" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ipsimdata/gbs_example_barcodes.txt\n", "ipsimdata/gbs_example_genome.fa\n", "ipsimdata/gbs_example_R1_.fastq.gz\n", "ipsimdata/pairddrad_example_barcodes.txt\n", "ipsimdata/pairddrad_example_R1_.fastq.gz\n", "ipsimdata/pairddrad_example_R2_.fastq.gz\n", "ipsimdata/pairddrad_wmerge_example_barcodes.txt\n", "ipsimdata/pairddrad_wmerge_example_genome.fa\n", "ipsimdata/pairddrad_wmerge_example_R1_.fastq.gz\n", "ipsimdata/pairddrad_wmerge_example_R2_.fastq.gz\n", "ipsimdata/pairgbs_example_barcodes.txt\n", "ipsimdata/pairgbs_example_R1_.fastq.gz\n", "ipsimdata/pairgbs_example_R2_.fastq.gz\n", "ipsimdata/pairgbs_wmerge_example_barcodes.txt\n", "ipsimdata/pairgbs_wmerge_example_genome.fa\n", "ipsimdata/pairgbs_wmerge_example_R1_.fastq.gz\n", "ipsimdata/pairgbs_wmerge_example_R2_.fastq.gz\n", "ipsimdata/rad_example_barcodes.txt\n", "ipsimdata/rad_example_genome.fa\n", "ipsimdata/rad_example_R1_.fastq.gz\n" ] } ], "source": [ "%%bash\n", "## compressed dir/ w/ all data files\n", "tar -zcvf ipsimdata.tar.gz ipsimdata/* " ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }