{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### ipyrad testing for pairddrad data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.1.37\n" ] } ], "source": [ "import ipyrad as ip ## for RADseq assembly\n", "print ip.__version__ ## print version\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## clear existing test_dir/\n", "import shutil\n", "import os\n", "if os.path.exists(\"./test_pairddrad/\"):\n", " shutil.rmtree(\"./test_pairddrad/\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New Assembly: data1\n" ] } ], "source": [ "data1 = ip.Assembly('data1')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 assembly_name data1 \n", " 1 project_dir . \n", " 2 raw_fastq_path ./*.fastq \n", " 3 barcodes_path ./*.barcodes.txt \n", " 4 sorted_fastq_path \n", " 5 assembly_method denovo \n", " 6 reference_sequence \n", " 7 datatype rad \n", " 8 restriction_overhang ('TGCAG', '') \n", " 9 max_low_qual_bases 5 \n", " 10 phred_Qscore_offset 33 \n", " 11 mindepth_statistical 6 \n", " 12 mindepth_majrule 6 \n", " 13 maxdepth 1000 \n", " 14 clust_threshold 0.85 \n", " 15 max_barcode_mismatch 1 \n", " 16 filter_adapters 0 \n", " 17 filter_min_trim_len 35 \n", " 18 max_alleles_consens 2 \n", " 19 max_Ns_consens (5, 5) \n", " 20 max_Hs_consens (8, 8) \n", " 21 min_samples_locus 4 \n", " 22 max_SNPs_locus (100, 100) \n", " 23 max_Indels_locus (5, 99) \n", " 24 max_shared_Hs_locus 0.25 \n", " 25 edit_cutsites (0, 0) \n", " 26 trim_overhang (1, 2, 2, 1) \n", " 27 output_formats * \n", " 28 pop_assign_file \n", " 29 excludes \n", " 30 outgroups \n" ] } ], "source": [ "data1.get_params()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 assembly_name data1 \n", " 1 project_dir ./test_pairddrad \n", " 2 raw_fastq_path ./data/sim_pairddrad_*.gz \n", " 3 barcodes_path ./data/sim_pairddrad_barcodes.txt \n", " 4 sorted_fastq_path \n", " 5 assembly_method denovo \n", " 6 reference_sequence \n", " 7 datatype pairddrad \n", " 8 restriction_overhang ('TGCAG', 'AATT') \n", " 9 max_low_qual_bases 5 \n", " 10 phred_Qscore_offset 33 \n", " 11 mindepth_statistical 6 \n", " 12 mindepth_majrule 6 \n", " 13 maxdepth 1000 \n", " 14 clust_threshold 0.85 \n", " 15 max_barcode_mismatch 1 \n", " 16 filter_adapters 0 \n", " 17 filter_min_trim_len 35 \n", " 18 max_alleles_consens 2 \n", " 19 max_Ns_consens (5, 5) \n", " 20 max_Hs_consens (8, 8) \n", " 21 min_samples_locus 4 \n", " 22 max_SNPs_locus (100, 100) \n", " 23 max_Indels_locus (5, 99) \n", " 24 max_shared_Hs_locus 0.25 \n", " 25 edit_cutsites (0, 0) \n", " 26 trim_overhang (1, 2, 2, 1) \n", " 27 output_formats * \n", " 28 pop_assign_file \n", " 29 excludes \n", " 30 outgroups \n" ] } ], "source": [ "data1.set_params('project_dir', \"./test_pairddrad\")\n", "data1.set_params('raw_fastq_path', \"./data/sim_pairddrad_*.gz\")\n", "data1.set_params('barcodes_path', \"./data/sim_pairddrad_barcodes.txt\")\n", "data1.set_params('restriction_overhang', (\"TGCAG\", \"AATT\"))\n", "data1.set_params('datatype', 'pairddrad')\n", "#data1.set_params(17, 1)\n", "\n", 
"data1.get_params()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#data1.link_fastqs(path=\"test_pairddrad/test_pairddrad_fastqs/\", append=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw\n", "1A0 1 20000\n", "1B0 1 20000\n", "1C0 1 20000\n", "1D0 1 20000\n", "2E0 1 20000\n", "2F0 1 20000\n", "2G0 1 20000\n", "2H0 1 20000\n", "3I0 1 20000\n", "3J0 1 20000\n", "3K0 1 20000\n", "3L0 1 20000\n" ] } ], "source": [ "data1.step1()\n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw reads_filtered\n", "1A0 2 20000 20000\n", "1B0 2 20000 20000\n", "1C0 2 20000 20000\n", "1D0 2 20000 20000\n", "2E0 2 20000 20000\n", "2F0 2 20000 20000\n", "2G0 2 20000 20000\n", "2H0 2 20000 20000\n", "3I0 2 20000 20000\n", "3J0 2 20000 20000\n", "3K0 2 20000 20000\n", "3L0 2 20000 20000\n" ] } ], "source": [ "data1.step2()#[\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True)\n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw reads_filtered clusters_total clusters_hidepth\n", "1A0 3 20000 20000 1000 1000\n", "1B0 3 20000 20000 1000 1000\n", "1C0 3 20000 20000 1000 1000\n", "1D0 3 20000 20000 1000 1000\n", "2E0 3 20000 20000 1000 1000\n", "2F0 3 20000 20000 1000 1000\n", "2G0 3 20000 20000 1000 1000\n", "2H0 3 20000 20000 1000 1000\n", "3I0 3 20000 20000 1000 1000\n", "3J0 3 20000 20000 1000 1000\n", "3K0 3 20000 20000 1000 1000\n", "3L0 3 20000 20000 1000 1000\n" ] } ], "source": [ "# data1.step3() ## do all samples\n", "# data1.step3(\"1A0\") ## do one sample\n", "# data1.step3([\"1A0\", \"1B0\", \"1C0\"]) ## do list of samples\n", "data1.step3()#[\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True) \n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " skipping 1B0; already estimated. Use force=True to overwrite.\n", " skipping 2H0; already estimated. Use force=True to overwrite.\n", " skipping 3J0; already estimated. Use force=True to overwrite.\n", " skipping 3K0; already estimated. 
Use force=True to overwrite.\n", " Saving current assembly.\n", " state reads_raw reads_filtered clusters_total clusters_hidepth \\\n", "1A0 4 20000 20000 1000 1000 \n", "1B0 4 20000 20000 1000 1000 \n", "1C0 4 20000 20000 1000 1000 \n", "1D0 4 20000 20000 1000 1000 \n", "2E0 4 20000 20000 1000 1000 \n", "2F0 4 20000 20000 1000 1000 \n", "2G0 4 20000 20000 1000 1000 \n", "2H0 4 20000 20000 1000 1000 \n", "3I0 4 20000 20000 1000 1000 \n", "3J0 4 20000 20000 1000 1000 \n", "3K0 4 20000 20000 1000 1000 \n", "3L0 4 20000 20000 1000 1000 \n", "\n", " hetero_est error_est \n", "1A0 0.001308 0.000488 \n", "1B0 0.001422 0.000494 \n", "1C0 0.001427 0.000482 \n", "1D0 0.001286 0.000497 \n", "2E0 0.001226 0.000491 \n", "2F0 0.001406 0.000479 \n", "2G0 0.001497 0.000506 \n", "2H0 0.001460 0.000502 \n", "3I0 0.001432 0.000504 \n", "3J0 0.001464 0.000483 \n", "3K0 0.001412 0.000502 \n", "3L0 0.001282 0.000470 \n", "CPU times: user 331 ms, sys: 4.28 ms, total: 336 ms\n", "Wall time: 7.07 s\n" ] } ], "source": [ "%%time\n", "data1.step4()#[\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True) \n", "print data1.stats\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DEBUG:ipyrad:H4CKERZ-mode: __loglevel__ = DEBUG\n", "INFO:ipyrad.core.parallel:Local connection to 4 engines [ipyrad-2480]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading Assembly: test_pairddrad [test_pairddrad/test_pairddrad.assembly]\n", "ipyparallel setup: Local connection to 4 engines\n", "\n" ] } ], "source": [ "#data1.save(\"upto2\")\n", "## still figuring out how best to save...\n", "import ipyrad as ip\n", "data1 = ip.load_assembly(\"test_pairddrad/test_pairddrad.assembly\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw reads_filtered clusters_total clusters_hidepth \\\n", "1A0 5 20000 20000 1000 1000 \n", "1B0 5 20000 20000 1000 1000 \n", "1C0 5 20000 20000 1000 1000 \n", "1D0 5 20000 20000 1000 1000 \n", "2E0 5 20000 20000 1000 1000 \n", "2F0 5 20000 20000 1000 1000 \n", "2G0 5 20000 20000 1000 1000 \n", "2H0 5 20000 20000 1000 1000 \n", "3I0 5 20000 20000 1000 1000 \n", "3J0 5 20000 20000 1000 1000 \n", "3K0 5 20000 20000 1000 1000 \n", "3L0 5 20000 20000 1000 1000 \n", "\n", " hetero_est error_est reads_consens \n", "1A0 0.001308 0.000488 1000 \n", "1B0 0.001422 0.000494 1000 \n", "1C0 0.001427 0.000482 1000 \n", "1D0 0.001286 0.000497 1000 \n", "2E0 0.001226 0.000491 1000 \n", "2F0 0.001406 0.000479 1000 \n", "2G0 0.001497 0.000506 1000 \n", "2H0 0.001460 0.000502 1000 \n", "3I0 0.001432 0.000504 1000 \n", "3J0 0.001464 0.000483 1000 \n", "3K0 0.001412 0.000502 1000 \n", "3L0 0.001282 0.000470 1000 \n" ] } ], "source": [ "data1.step5()#[\"1B0\"], force=True) \n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "data1.step5([\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True) \n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Saving current assembly.\n", " state reads_raw reads_filtered clusters_total clusters_hidepth \\\n", "1A0 6 20000 20000 1000 1000 \n", "1B0 6 20000 20000 1000 1000 \n", "1C0 6 
20000 20000 1000 1000 \n", "1D0 6 20000 20000 1000 1000 \n", "2E0 6 20000 20000 1000 1000 \n", "2F0 6 20000 20000 1000 1000 \n", "2G0 6 20000 20000 1000 1000 \n", "2H0 6 20000 20000 1000 1000 \n", "3I0 6 20000 20000 1000 1000 \n", "3J0 6 20000 20000 1000 1000 \n", "3K0 6 20000 20000 1000 1000 \n", "3L0 6 20000 20000 1000 1000 \n", "\n", " hetero_est error_est reads_consens \n", "1A0 0.001308 0.000488 1000 \n", "1B0 0.001422 0.000494 1000 \n", "1C0 0.001427 0.000482 1000 \n", "1D0 0.001286 0.000497 1000 \n", "2E0 0.001226 0.000491 1000 \n", "2F0 0.001406 0.000479 1000 \n", "2G0 0.001497 0.000506 1000 \n", "2H0 0.001460 0.000502 1000 \n", "3I0 0.001432 0.000504 1000 \n", "3J0 0.001464 0.000483 1000 \n", "3K0 0.001412 0.000502 1000 \n", "3L0 0.001282 0.000470 1000 \n" ] } ], "source": [ "data1.step6()#([\"1B0\", \"2H0\", \"3J0\", \"3K0\"], force=True) \n", "print data1.stats" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "inc ['1B0', '2G0', '1C0', '1A0', '2H0', '2E0', '3L0', '3I0', '1D0', '2F0', '3J0', '3K0']\n", "sidx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n", "finished filtering\n", " Saving current assembly.\n" ] } ], "source": [ "data1.step7()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'loci': '/home/deren/Documents/ipyrad/tests/test_pairddrad/data1_outfiles/data1.loci'}" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data1.outfiles\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [], "source": [ "less $data1.outfiles.loci" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "print pd.read_table('test_pairddrad/test_pairddrad_consens/s5_consens.txt', delim_whitespace=1, header=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## how to merge Assembly objects\n", "data1.files.edits['1B0']\n", "data1.samples['1B0'].files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "print data1.stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "for i in data1.log:\n", " print i" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "adds = np.ones([10, 4], dtype='int16')\n", "adds.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.array([30, 1], dtype='int16')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "longest_reads = 190\n", "nreads = int(1e5)\n", "\n", "## preallocate a (nreads x 200 x 4) base-count array and fill the top of the first slot\n", "arr = np.empty([nreads, 200, 4], dtype='int16')\n", "arr[0][0:adds.shape[0]] = adds\n", "arr[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import glob\n", "import os\n", "import numpy" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "catg = 
numpy.load(\"test_pairddrad/test_pairddrad_consens/1B0.catg\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "catg[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cats1 = glob.glob(os.path.join(\n", " data1.dirs.consens,\n", " data1.samples['1B0'].name+\"_tmpcats.*\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cats1.sort(key=lambda x: int(x.split(\".\")[-1]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "catg = numpy.load(cats1[0])\n", "lastg = numpy.load(cats1[-1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "catg[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "lastg.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Making alignment faster" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(\">\"+i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " piped = subprocess.Popen(cmd, shell=True, \n", " stdin=subprocess.PIPE,\n", " stdout=subprocess.PIPE,\n", " stderr=subprocess.STDOUT,\n", " close_fds=True)\n", " _, fout = piped.stdin, piped.stdout\n", " return fout.read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(\">\"+i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " piped = subprocess.Popen(shlex.split(cmd), \n", " stdout=subprocess.PIPE)\n", " return piped.stdout.read()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit\n", "out = alignfast(data1, names, seqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "print out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def newmuscle(data, names, seqs):\n", " inputstring = \"\\n\".join(\">\"+i+\"\\n\"+j for i, j in zip(names, seqs))\n", " return subprocess.Popen(data.muscle, \n", " stdin=subprocess.PIPE, \n", " stdout=subprocess.PIPE)\\\n", " .communicate(inputstring)[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit \n", "newmuscle(data1, names, seqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(\">\"+i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " piped = subprocess.check_output(cmd, shell=True)\n", " return piped" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], 
"source": [ "out = alignfast(data1, names, seqs)\n", "print out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit\n", "out = alignfast(data1, names, seqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "inputstring = \"\\n\".join(i+\"\\n\"+j for i, j in zip(names, seqs))\n", "print inputstring" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pipe = subprocess.Popen(shlex.split(cmd), \n", " stdin=subprocess.PIPE, \n", " stderr=subprocess.PIPE, stdout=subprocess.PIPE)\n", "pipe.stdin.write(\"muscle -h\")\n", "get = pipe.communicate(input=pipe)\n", "get" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast2(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " \n", " piped = subprocess.Popen(shlex.split(cmd), \n", " stdin=subprocess.PIPE, \n", " stdout=subprocess.PIPE,\n", " stderr=subprocess.PIPE)\n", " stdin, stdout = piped.communicate(input=cmd)\n", " return stdin, stdout" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def alignfast3(data, names, seqs):\n", " \"\"\" makes subprocess call to muscle \"\"\"\n", " inputstring = \"\\n\".join(i+\"\\n\"+j for i, j in zip(names, seqs))\n", " cmd = \"/bin/echo '\"+inputstring+\"' | \"+data.muscle+\" -quiet -in -\"\n", " \n", " piped = subprocess.check_output(cmd,\n", " shell=True)\n", " return piped" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def sortalign(stringnames):\n", " \"\"\" parses muscle output from a string to two lists \"\"\"\n", " objs = stringnames.split(\"\\n>\")\n", " seqs = [i.split(\"\\n\")[0].replace(\">\", \"\")+\"\\n\"+\\\n", " \"\".join(i.split('\\n')[1:]) for i in objs]\n", " \n", " aligned = [i.split(\"\\n\") for i in seqs]\n", " newnames = [\">\"+i[0] for i in aligned]\n", " seqs = [i[1] for i in aligned] \n", " ## return in sorted order by names\n", " sortedtups = [(i, j) for i, j in zip(*sorted(zip(newnames, seqs), \n", " key=lambda pair: pair[0]))]\n", " return sortedtups\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### get clusters" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import gzip\n", "import subprocess\n", "import shlex\n", "infile = gzip.open(\"test_pairddrad/test_pairddrad_clust_0.85/1B0.clust.gz\")\n", "clusts = infile.read().split(\"//\\n//\\n\")[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "for clust in clusts:\n", " lines = clust.split(\"\\n\")\n", " names = lines[::2]\n", " seqs = lines[1::2]\n", " \n", "seqs[0] = list(seqs[0])\n", "seqs[0].insert(7, \"AA\")\n", "seqs[0] = \"\".join(seqs[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit\n", "alignfast(data1, names, seqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%timeit \n", "alignfast3(data1, names, seqs)" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "out = alignfast3(data1, names, seqs)\n", "#sorts = sortalign(out)\n", "print out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "out" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def sortalign(stringnames):\n", " \"\"\" parses muscle output from a string to two lists \"\"\"\n", " objs = stringnames[1:].split(\"\\n>\")\n", " seqs = [i.split(\"\\n\")[0].replace(\">\", \"\")+\"\\n\"+\\\n", " \"\".join(i.split('\\n')[1:]) for i in objs]\n", " \n", " aligned = [i.split(\"\\n\") for i in seqs]\n", " newnames = [\">\"+i[0] for i in aligned]\n", " seqs = [i[1] for i in aligned] \n", " \n", " ## return in sorted order by names\n", " sortedtups = [(i, j) for i, j in zip(*sorted(zip(newnames, seqs), \n", " key=lambda pair: pair[0]))]\n", " return sortedtups\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def parsemuscle(out):\n", " \"\"\" parse muscle string output into two sorted lists \"\"\"\n", " lines = out[1:].split(\"\\n>\")\n", " names = [line.split(\"\\n\", 1)[0] for line in lines]\n", " seqs = [line.split(\"\\n\", 1)[1].replace(\"\\n\", \"\") for line in lines]\n", " tups = zip(names, seqs)\n", " anames, aseqs = zip(*sorted(tups, key=lambda x: int(x[0].split(\";\")[-1][1:])))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }