{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.clustmap import *\n", "from ipyrad.assemble.cluster_across import *\n", "import ipyparallel as ipp" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New Assembly: 5-cyatho\n" ] } ], "source": [ "data = ip.Assembly(\"5-cyatho\")\n", "data.set_params(\"project_dir\", \"5-cyatho\")\n", "data.set_params(\"sorted_fastq_path\", \"./example_empirical_rad/*.gz\")\n", "data.set_params(\"filter_adapters\", 2)\n", "data.set_params(\"clust_threshold\", .90)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: 5-cyatho\n", "[####################] 100% 0:00:12 | loading reads | s1 |\n", "[####################] 100% 0:04:36 | processing reads | s2 |\n" ] } ], "source": [ "data.run(\"12\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: \n", " Controller appears to be listening on localhost, but not on this machine.\n", " If this is true, you should specify Client(...,sshserver='you@oud')\n", " or instruct your controller to listen on an external IP.\n", " RuntimeWarning)\n" ] }, { "ename": "NameError", "evalue": "name 'data' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mipyrad\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massemble\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclustmap\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mipyclient\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mipp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mClient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0ms3\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStep3\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mipyclient\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0msamples\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0msample\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msamples\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'data' is not defined" ] } 
], "source": [ "from ipyrad.assemble.clustmap import *\n", "ipyclient = ipp.Client()\n", "s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)\n", "samples = list(s3.data.samples.values())\n", "sample = samples[1]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:00:30 | dereplicating | s3 |\n", "[####################] 100% 0:18:59 | clustering/mapping | s3 |\n", "[####################] 100% 0:00:18 | building clusters | s3 |\n", "[####################] 100% 0:00:01 | chunking clusters | s3 |\n", "[####################] 100% 0:19:32 | aligning clusters | s3 |\n", "[####################] 100% 0:00:39 | concat clusters | s3 |\n" ] } ], "source": [ "s3.run()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: 5-cyatho\n", "[####################] 100% 0:02:44 | inferring [H, E] | s4 |\n", "[####################] 100% 0:00:06 | calculating depths | s5 |\n", "[####################] 100% 0:00:13 | chunking clusters | s5 |\n", "[####################] 100% 0:11:47 | consens calling | s5 |\n" ] } ], "source": [ "data.run(\"45\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: 5-cyatho\n", "from saved path: ~/Documents/ipyrad/tests/5-cyatho/5-cyatho.json\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: \n", " Controller appears to be listening on localhost, but not on this machine.\n", " If this is true, you should specify Client(...,sshserver='you@oud')\n", " or instruct your controller to listen on an external IP.\n", " RuntimeWarning)\n" ] } ], "source": [ "data = ip.load_json(\"5-cyatho/5-cyatho.json\")\n", "samples = list(data.samples.values())\n", "ipyclient = ipp.Client()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'o': (0, ['33588_przewalskii', '32082_przewalskii']),\n", " 'c': (3,\n", " ['29154_superba',\n", " '30686_cyathophylla',\n", " '41478_cyathophylloides',\n", " '41954_cyathophylloides']),\n", " 'r': (3,\n", " ['35236_rex',\n", " '35855_rex',\n", " '38362_rex',\n", " '39618_rex',\n", " '40578_rex',\n", " '30556_thamno',\n", " '33413_thamno'])}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from ipyrad.assemble.cluster_across import *\n", "popmins = {'o': 0, 'c': 3, 'r': 3}\n", "popdict = {\n", " 'o': ['33588_przewalskii', '32082_przewalskii'],\n", " 'c': ['29154_superba', '30686_cyathophylla', '41478_cyathophylloides', '41954_cyathophylloides'],\n", " 'r': [ '35236_rex', '35855_rex', '38362_rex', '39618_rex', '40578_rex', '30556_thamno', '33413_thamno'],\n", "}\n", "data._link_populations(popdict, popmins)\n", "data.populations" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "s6 = Step6(data, samples, ipyclient, 0, 0)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:05 | concatenating inputs | s6 |\n" ] } ], "source": [ "s6.remote_build_concats_tier1()" ] }, { "cell_type": "code", "execution_count": 8, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:01:43 | clustering across 1 | s6 |\n" ] } ], "source": [ "s6.remote_cluster1()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:01 | concatenating inputs | s6 |\n" ] } ], "source": [ "s6.remote_build_concats_tier2()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:02:27 | clustering 2 | s6 |\n" ] } ], "source": [ "s6.remote_cluster2()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:04 | building clusters | s6 |\n" ] } ], "source": [ "s6.remote_build_denovo_clusters()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:11 | aligning clusters | s6 |\n" ] } ], "source": [ "s6.remote_align_denovo_clusters()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "could not broadcast input array from shape (104) into shape (103)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign_to_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms6\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms6\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36malign_to_array\u001b[0;34m(data, samples, chunk)\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mvarmat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrefbuild\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseqarr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'u1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPSEUDO_REF\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0mconstmp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvarmat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'S1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0mconsens\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mldx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mconstmp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconstmp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0mvarstmp\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnonzero\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvarmat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mvarpos\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mldx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mvarstmp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvarstmp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: could not broadcast input array from shape (104) into shape (103)" ] } ], "source": [ "chunk = \"./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760\"\n", "align_to_array(s6.data, s6.samples, chunk)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def align_to_array(data, samples, chunk):\n", "\n", " # data are already chunked, read in the whole thing\n", " with open(chunk, 'rt') as infile:\n", " clusts = infile.read().split(\"//\\n//\\n\")[:-1] \n", "\n", " # snames to ensure sorted order\n", " samples.sort(key=lambda x: x.name)\n", " snames = [sample.name for sample in samples]\n", "\n", " # make a tmparr to store metadata (this can get huge, consider using h5)\n", " maxvar = 25\n", " maxlen = data._hackersonly[\"max_fragment_length\"] + 20\n", " maxinds = data.paramsdict[\"max_Indels_locus\"]\n", " ifilter = np.zeros(len(clusts), dtype=np.bool_)\n", " dfilter = np.zeros(len(clusts), dtype=np.bool_)\n", " varpos = np.zeros((len(clusts), maxvar), dtype='u1')\n", " indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')\n", " edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')\n", " consens = np.zeros((len(clusts), maxlen), dtype=\"S1\")\n", "\n", " # create a persistent shell for running muscle in. 
\n", " proc = sps.Popen([\"bash\"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)\n", "\n", " # iterate over clusters until finished\n", " allstack = []\n", " for ldx in range(len(clusts)):\n", " istack = []\n", " lines = clusts[ldx].strip().split(\"\\n\")\n", " names = lines[::2]\n", " seqs = lines[1::2]\n", " seqs = [i[:90] for i in seqs] \n", " align1 = []\n", " \n", " # find duplicates and skip aligning but keep it for downstream.\n", " unames = set([i.rsplit(\"_\", 1)[0] for i in names])\n", " if len(unames) < len(names):\n", " dfilter[ldx] = True\n", " istack = [\"{}\\n{}\".format(i[1:], j) for i, j in zip(names, seqs)]\n", "\n", " else:\n", " # append counter to names because muscle doesn't retain order\n", " nnames = [\">{};*{}\".format(j[1:], i) for i, j in enumerate(names)]\n", "\n", " # make back into strings\n", " cl1 = \"\\n\".join([\"\\n\".join(i) for i in zip(nnames, seqs)]) \n", "\n", " # store allele (lowercase) info\n", " amask, abool = store_alleles(seqs)\n", "\n", " # send align1 to the bash shell (TODO: check for pipe-overflow)\n", " cmd1 = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n", " .format(cl1, ipyrad.bins.muscle, \"//\\n\"))\n", " proc.stdin.write(cmd1.encode())\n", "\n", " # read the stdout by line until splitter is reached\n", " for line in iter(proc.stdout.readline, b'//\\n'):\n", " align1.append(line.decode())\n", "\n", " # reorder b/c muscle doesn't keep order\n", " lines = \"\".join(align1)[1:].split(\"\\n>\")\n", " dalign1 = dict([i.split(\"\\n\", 1) for i in lines])\n", " keys = sorted(\n", " dalign1.keys(), \n", " key=lambda x: int(x.rsplit(\"*\")[-1])\n", " )\n", " seqarr = np.zeros(\n", " (len(nnames), len(dalign1[keys[0]].replace(\"\\n\", \"\"))),\n", " dtype='S1',\n", " )\n", " for kidx, key in enumerate(keys):\n", " concatseq = dalign1[key].replace(\"\\n\", \"\")\n", " seqarr[kidx] = list(concatseq)\n", " \n", " # fill in edges for each seq\n", " edg = (\n", " len(concatseq) - len(concatseq.lstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " )\n", " sidx = snames.index(key.rsplit('_', 1)[0])\n", " edges[ldx, sidx] = edg\n", "\n", " # get ref/variant counts in an array\n", " varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)\n", " constmp = varmat[:, 0].view('S1')\n", " consens[ldx, :constmp.size] = constmp\n", " varstmp = np.nonzero(varmat[:, 1])[0]\n", " varpos[ldx, :varstmp.size] = varstmp \n", "\n", " # impute allele information back into the read b/c muscle\n", " wkeys = np.argsort([i.rsplit(\"_\", 1)[0] for i in keys])\n", "\n", " # sort in sname (alphanumeric) order for indels storage\n", " for idx in wkeys:\n", " # seqarr and amask are in input order here\n", " args = (seqarr, idx, amask)\n", " seqarr[idx], indidx = retrieve_indels_and_alleles(*args)\n", " sidx = snames.index(names[idx].rsplit(\"_\", 1)[0][1:])\n", " if indidx.size:\n", " indsize = min(indidx.size, sum(maxinds))\n", " indels[ldx, sidx, :indsize] = indidx[:indsize]\n", " wname = names[idx]\n", "\n", " # store (only for visually checking alignments)\n", " istack.append(\n", " \"{}\\n{}\".format(wname, b\"\".join(seqarr[idx]).decode()))\n", "\n", " # indel filter\n", " if indels.sum(axis=1).max() >= sum(maxinds):\n", " ifilter[ldx] = True\n", "\n", " # store the stack (only for visually checking alignments)\n", " if istack:\n", " allstack.append(\"\\n\".join(istack))\n", "\n", " # cleanup\n", " proc.stdout.close()\n", " if proc.stderr:\n", " proc.stderr.close()\n", " proc.stdin.close()\n", " proc.wait()\n", 
"\n", " # write to file after (only for visually checking alignments)\n", " odx = chunk.rsplit(\"_\")[-1]\n", " alignfile = os.path.join(data.tmpdir, \"aligned_{}.fa\".format(odx))\n", " with open(alignfile, 'wt') as outfile:\n", " outfile.write(\"\\n//\\n//\\n\".join(allstack) + \"\\n\")\n", " #os.remove(chunk)\n", "\n", " # save indels array to tmp dir\n", " np.save(\n", " os.path.join(data.tmpdir, \"ifilter_{}.tmp.npy\".format(odx)),\n", " ifilter)\n", " np.save(\n", " os.path.join(data.tmpdir, \"dfilter_{}.tmp.npy\".format(odx)),\n", " dfilter)\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: 4-refpairtest\n", "from saved path: ~/Documents/ipyrad/tests/4-refpairtest.json\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: \n", " Controller appears to be listening on localhost, but not on this machine.\n", " If this is true, you should specify Client(...,sshserver='you@oud')\n", " or instruct your controller to listen on an external IP.\n", " RuntimeWarning)\n" ] } ], "source": [ "data = ip.load_json(\"4-refpairtest.json\")\n", "samples = list(data.samples.values())\n", "sample = samples[0]\n", "force = True\n", "ipyclient = ipp.Client()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'3J_0': ,\n", " '1D_0': ,\n", " '2H_0': ,\n", " '1B_0': ,\n", " '2F_0': ,\n", " '3L_0': ,\n", " '2E_0': ,\n", " '3I_0': ,\n", " '3K_0': ,\n", " '2G_0': ,\n", " '1A_0': ,\n", " '1C_0': }" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.samples" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'1': (3, ['1A_0', '1B_0', '1C_0', '1D_0']),\n", " '2': (3, ['2E_0', '2F_0', '2G_0', '2H_0']),\n", " '3': (3, ['3I_0', '3J_0', '3K_0', '3L_0'])}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "popmins = {'1': 3, '2': 3, '3': 3}\n", "popdict = {\n", " '1': ['1A_0', '1B_0', '1C_0', '1D_0'], \n", " '2': ['2E_0', '2F_0', '2G_0', '2H_0'],\n", " '3': ['3I_0', '3J_0', '3K_0', '3L_0'],\n", "}\n", "\n", "data._link_populations(popdict, popmins)\n", "data.populations" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = Step6(data, samples, ipyclient, 123, True)\n", "s.samples" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | concatenating inputs | s6 |\n" ] } ], "source": [ "s.remote_build_concats_tier1()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:02 | clustering across 1 | s6 |\n" ] } ], "source": [ "s.remote_cluster1()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | concatenating inputs | s6 |\n" ] } ], "source": [ "s.remote_build_concats_tier2()\n" ] }, { "cell_type": "code", 
"execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | clustering 2 | s6 |\n" ] } ], "source": [ "s.remote_cluster2()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | building clusters | s6 |\n" ] } ], "source": [ "s.remote_build_denovo_clusters()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "chunks = glob.glob('4-refpairtest_across/4-refpairtest-tmpalign/*.chunk*')\n", "for chunk in chunks:\n", " align_to_array(s.data, s.samples, chunk)" ] }, { "cell_type": "code", "execution_count": 317, "metadata": {}, "outputs": [], "source": [ "chunk = \"4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_70\"\n", "samples = s.samples\n", "data = s.data\n", "\n", "# data are already chunked, read in the whole thing\n", "#with open(chunk, 'rt') as infile:\n", "# clusts = infile.read().split(\"//\\n//\\n\")[:-1] " ] }, { "cell_type": "code", "execution_count": 389, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['>1A_0_7;*0',\n", " '>1B_0_7;*1',\n", " '>1D_0_7;*2',\n", " '>1C_0_7;*3',\n", " '>2E_0_7;*4',\n", " '>2F_0_7;*5',\n", " '>2H_0_7;*6',\n", " '>2G_0_7;*7',\n", " '>3I_0_7;*8',\n", " '>3J_0_7;*9',\n", " '>3L_0_7;*10',\n", " '>3K_0_7;*11']" ] }, "execution_count": 389, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nnames" ] }, { "cell_type": "code", "execution_count": 451, "metadata": {}, "outputs": [], "source": [ "def align_to_array(data, samples, chunk):\n", "\n", " # data are already chunked, read in the whole thing\n", " with open(chunk, 'rt') as infile:\n", " clusts = infile.read().split(\"//\\n//\\n\")[:-1] \n", "\n", " # snames to ensure sorted order\n", " samples.sort(key=lambda x: x.name)\n", " snames = [sample.name for sample in samples]\n", "\n", " # make a tmparr to store metadata (this can get huge, consider using h5)\n", " maxvar = 25\n", " maxlen = data._hackersonly[\"max_fragment_length\"] + 20\n", " maxinds = data.paramsdict[\"max_Indels_locus\"]\n", " ifilter = np.zeros(len(clusts), dtype=np.bool_)\n", " dfilter = np.zeros(len(clusts), dtype=np.bool_)\n", " varpos = np.zeros((len(clusts), maxvar), dtype='u1')\n", " indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')\n", " edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')\n", " consens = np.zeros((len(clusts), maxlen), dtype=\"S1\")\n", "\n", " # create a persistent shell for running muscle in. 
\n", " proc = sps.Popen([\"bash\"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)\n", "\n", " # iterate over clusters until finished\n", " allstack = []\n", " for ldx in range(len(clusts)):\n", " istack = []\n", " lines = clusts[ldx].strip().split(\"\\n\")\n", " names = lines[::2]\n", " seqs = lines[1::2] \n", " align1 = []\n", " \n", " # find duplicates and skip aligning/ordering since it will be filtered.\n", " unames = set([i.rsplit(\"_\", 1)[0] for i in names])\n", " if len(unames) < len(names):\n", " dfilter[ldx] = True\n", " istack = [\"{}\\n{}\".format(i[1:], j) for i, j in zip(names, seqs)]\n", "\n", " else:\n", " # append counter to names because muscle doesn't retain order\n", " nnames = [\">{};*{}\".format(j[1:], i) for i, j in enumerate(names)]\n", "\n", " # make back into strings\n", " cl1 = \"\\n\".join([\"\\n\".join(i) for i in zip(nnames, seqs)]) \n", "\n", " # store allele (lowercase) info on seqs\n", " amask, abool = store_alleles(seqs)\n", "\n", " # send align1 to the bash shell (TODO: check for pipe-overflow)\n", " cmd1 = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n", " .format(cl1, ipyrad.bins.muscle, \"//\\n\"))\n", " proc.stdin.write(cmd1.encode())\n", "\n", " # read the stdout by line until splitter is reached\n", " for line in iter(proc.stdout.readline, b'//\\n'):\n", " align1.append(line.decode())\n", "\n", " # reorder into original order and arrange into a dictionary and arr\n", " lines = \"\".join(align1)[1:].split(\"\\n>\")\n", " dalign1 = dict([i.split(\"\\n\", 1) for i in lines])\n", " keys = sorted(\n", " dalign1.keys(), \n", " key=lambda x: int(x.rsplit(\"*\")[-1])\n", " )\n", " seqarr = np.zeros(\n", " (len(nnames), len(dalign1[keys[0]].replace(\"\\n\", \"\"))),\n", " dtype='S1',\n", " )\n", " for kidx, key in enumerate(keys):\n", " concatseq = dalign1[key].replace(\"\\n\", \"\")\n", " seqarr[kidx] = list(concatseq)\n", " \n", " # fill in edges for each seq\n", " edg = (\n", " len(concatseq) - len(concatseq.lstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " )\n", " sidx = snames.index(key.rsplit('_', 1)[0])\n", " edges[ldx, sidx] = edg\n", "\n", " # get ref/variant counts in an array\n", " varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)\n", " constmp = varmat[:, 0].view(\"S1\")\n", " consens[ldx, :constmp.size] = constmp\n", " varstmp = np.nonzero(varmat[:, 1])[0]\n", " varpos[ldx, :varstmp.size] = varstmp \n", "\n", " # impute allele information back into the read b/c muscle\n", " wkeys = np.argsort([i.rsplit(\"_\", 1)[0] for i in keys])\n", "\n", " # sort in sname (alphanumeric) order for indels storage\n", " for idx in wkeys:\n", " # seqarr and amask are in input order here\n", " args = (seqarr, idx, amask)\n", " seqarr[idx], indidx = retrieve_indels_and_alleles(*args)\n", " sidx = snames.index(names[idx].rsplit(\"_\", 1)[0][1:])\n", " if indidx.size:\n", " indels[sidx, :indidx.size] = indidx\n", " wname = names[idx]\n", " istack.append(\n", " \"{}\\n{}\".format(wname, b\"\".join(seqarr[idx]).decode()))\n", "\n", " # store the stack\n", " if istack:\n", " allstack.append(\"\\n\".join(istack))\n", "\n", " # cleanup\n", " proc.stdout.close()\n", " if proc.stderr:\n", " proc.stderr.close()\n", " proc.stdin.close()\n", " proc.wait()\n", "\n", " # write to file after\n", " odx = chunk.rsplit(\"_\")[-1]\n", " alignfile = os.path.join(data.tmpdir, \"align_{}.fa\".format(odx))\n", " with open(alignfile, 'wt') as outfile:\n", " outfile.write(\"\\n//\\n//\\n\".join(allstack) + \"\\n\")\n", " 
#os.remove(chunk)\n", "\n", " # save indels array to tmp dir\n", " ifile = os.path.join(data.tmpdir, \"indels_{}.tmp.npy\".format(odx))\n", " np.save(ifile, ifilter)\n", " dfile = os.path.join(data.tmpdir, \"duples_{}.tmp.npy\".format(odx))\n", " np.save(dfile, dfilter)" ] }, { "cell_type": "code", "execution_count": 456, "metadata": {}, "outputs": [], "source": [ "def store_alleles(seqs):\n", " shape = (len(seqs), max([len(i) for i in seqs]))\n", " arrseqs = np.zeros(shape, dtype=np.bytes_)\n", " for row in range(arrseqs.shape[0]):\n", " seqsrow = seqs[row]\n", " arrseqs[row, :len(seqsrow)] = list(seqsrow)\n", " amask = np.char.islower(arrseqs)\n", " if np.any(amask):\n", " return amask, True\n", " else:\n", " return amask, False\n", "\n", "\n", "\n", "def retrieve_indels_and_alleles(seqarr, idx, amask):\n", " concatarr = seqarr[idx]\n", " newmask = np.zeros(len(concatarr), dtype=np.bool_) \n", "\n", " # check for indels and impute to amask\n", " indidx = np.where(concatarr == b\"-\")[0]\n", " if np.sum(amask):\n", " print('yes')\n", " if indidx.size:\n", "\n", " # impute for position of variants\n", " allrows = np.arange(amask.shape[1])\n", " mask = np.ones(allrows.shape[0], dtype=np.bool_)\n", " for idx in indidx:\n", " if idx < mask.shape[0]:\n", " mask[idx] = False\n", " not_idx = allrows[mask == 1]\n", "\n", " # fill in new data into all other spots\n", " newmask[not_idx] = amask[idx, :not_idx.shape[0]]\n", "\n", " else:\n", " newmask = amask[idx]\n", " \n", " # lower the alleles\n", " concatarr[newmask] = np.char.lower(concatarr[newmask])\n", "\n", " return concatarr, indidx" ] }, { "cell_type": "code", "execution_count": 462, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, False, False, False, False, False, False, False, False,\n", " False, False, False, False, False, False, False, False, False,\n", " False, False, False, False, False, False, False, False, False,\n", " False, False, False, False, False, False, False, False])" ] }, "execution_count": 462, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.load(\"4-refpairtest_across/4-refpairtest-tmpalign/indels_35.tmp.npy\")" ] }, { "cell_type": "code", "execution_count": 457, "metadata": {}, "outputs": [], "source": [ "align_to_array(data, samples, chunk)" ] }, { "cell_type": "code", "execution_count": 459, "metadata": {}, "outputs": [], "source": [ "chunks = glob.glob(\"4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_*\")\n", "for chunk in chunks:\n", " align_to_array(data, samples, chunk)" ] }, { "cell_type": "code", "execution_count": 263, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "@numba.jit(nopython=True)\n", "def refbuild(iseq, consdict):\n", " \"\"\" Returns the most common base at each site in order. 
\"\"\"\n", "\n", " altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)\n", "\n", " for col in range(iseq.shape[1]):\n", " ## expand colums with ambigs and remove N-\n", " fcounts = np.zeros(111, dtype=np.int64)\n", " counts = np.bincount(iseq[:, col])#, minlength=90)\n", " fcounts[:counts.shape[0]] = counts\n", " ## set N and - to zero, wish numba supported minlen arg\n", " fcounts[78] = 0\n", " fcounts[45] = 0\n", " ## add ambig counts to true bases\n", " for aidx in range(consdict.shape[0]):\n", " nbases = fcounts[consdict[aidx, 0]]\n", " for _ in range(nbases):\n", " fcounts[consdict[aidx, 1]] += 1\n", " fcounts[consdict[aidx, 2]] += 1\n", " fcounts[consdict[aidx, 0]] = 0\n", "\n", " ## now get counts from the modified counts arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 0] = who\n", " fcounts[who] = 0\n", "\n", " ## if an alt allele fill over the \".\" placeholder\n", " who = np.argmax(fcounts)\n", " if who:\n", " altrefs[col, 1] = who\n", " fcounts[who] = 0\n", "\n", " ## if 3rd or 4th alleles observed then add to arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 2] = who\n", " fcounts[who] = 0\n", "\n", " ## if 3rd or 4th alleles observed then add to arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 3] = who\n", "\n", " return altrefs\n", "\n", "\n", "PSEUDO_REF = np.array([\n", " [82, 71, 65],\n", " [75, 71, 84],\n", " [83, 71, 67],\n", " [89, 84, 67],\n", " [87, 84, 65],\n", " [77, 67, 65],\n", " [78, 78, 78],\n", " [45, 45, 45],\n", " ], dtype=np.uint8)\n", "\n" ] }, { "cell_type": "code", "execution_count": 274, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[84, 71, 67, ..., 67, 67, 71],\n", " [84, 71, 67, ..., 67, 67, 71],\n", " [84, 71, 67, ..., 67, 67, 71],\n", " ...,\n", " [84, 71, 67, ..., 67, 67, 71],\n", " [84, 71, 67, ..., 67, 67, 71],\n", " [84, 71, 67, ..., 67, 67, 71]], dtype=uint8)" ] }, "execution_count": 274, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seqarr.view(\"u1\")" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]),\n", " array([3, 4, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7]))" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aseqs = np.array([\n", " list('ATG--GGA'),\n", " list('A--GGGGA'),\n", " list('---GGGGA'),\n", " list('--------'),\n", "])\n", "\n", "np.where(aseqs == \"-\")" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [], "source": [ "PSEUDO_REF = np.array([\n", " [82, 71, 65],\n", " [75, 71, 84],\n", " [83, 71, 67],\n", " [89, 84, 67],\n", " [87, 84, 65],\n", " [77, 67, 65],\n", " [78, 78, 78],\n", " [45, 45, 45],\n", " ], dtype=np.uint8)\n" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "@numba.jit(nopython=True)\n", "def reftrick(iseq, consdict):\n", " \"\"\" Returns the most common base at each site in order. 
\"\"\"\n", "\n", " altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)\n", " #altrefs[:, 1] = 46\n", "\n", " for col in range(iseq.shape[1]):\n", " ## expand colums with ambigs and remove N-\n", " fcounts = np.zeros(111, dtype=np.int64)\n", " counts = np.bincount(iseq[:, col])#, minlength=90)\n", " fcounts[:counts.shape[0]] = counts\n", " ## set N and - to zero, wish numba supported minlen arg\n", " fcounts[78] = 0\n", " fcounts[45] = 0\n", " ## add ambig counts to true bases\n", " for aidx in range(consdict.shape[0]):\n", " nbases = fcounts[consdict[aidx, 0]]\n", " for _ in range(nbases):\n", " fcounts[consdict[aidx, 1]] += 1\n", " fcounts[consdict[aidx, 2]] += 1\n", " fcounts[consdict[aidx, 0]] = 0\n", "\n", " ## now get counts from the modified counts arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 0] = who\n", " fcounts[who] = 0\n", "\n", " ## if an alt allele fill over the \".\" placeholder\n", " who = np.argmax(fcounts)\n", " if who:\n", " altrefs[col, 1] = who\n", " fcounts[who] = 0\n", "\n", " ## if 3rd or 4th alleles observed then add to arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 2] = who\n", " fcounts[who] = 0\n", "\n", " ## if 3rd or 4th alleles observed then add to arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 3] = who\n", "\n", " return altrefs" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[b'-', b'-', b'-', b'A', b'A', b'A', b'A', b'-', b'-', b'-', b'-'],\n", " [b'-', b'-', b'-', b'T', b'T', b'A', b'A', b'-', b'-', b'-', b'-'],\n", " [b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A'],\n", " [b'A', b'A', b'T', b'A', b'A', b'A', b'A', b'T', b'A', b'A', b'A']],\n", " dtype='|S1')" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sss = np.array([\n", " list(\"---AAAA----\"),\n", " list(\"---TTAA----\"),\n", " list(\"AAAAAAAAAAA\"),\n", " list(\"AATAAAATAAA\"),\n", "], dtype=\"S1\")\n", "sss" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[65, 0, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 84, 0, 0],\n", " [65, 84, 0, 0],\n", " [65, 84, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 84, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 0, 0, 0]], dtype=uint8)" ] }, "execution_count": 196, "metadata": {}, "output_type": "execute_result" } ], "source": [ "varmat = reftrick(sss.view(np.uint8), PSEUDO_REF)\n", "varmat" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['>1A_0_7',\n", " '>1B_0_7',\n", " '>1D_0_7',\n", " '>1C_0_7',\n", " '>2E_0_7',\n", " '>2F_0_7',\n", " '>2H_0_7',\n", " '>2G_0_7',\n", " '>3I_0_7',\n", " '>3J_0_7',\n", " '>3L_0_7',\n", " '>3K_0_7']" ] }, "execution_count": 226, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names" ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "31.3 µs ± 764 ns per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n" ] } ], "source": [ "%%timeit\n", "store_alleles(seqs)" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTAAATCCTCACGTCAACATGATGCCTWCATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTGCAATCCTCACGTCAATATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTYCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA']" ] }, "execution_count": 219, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[b\"\".join(i) for i in arrseqs ]" ] }, { "cell_type": "code", "execution_count": 229, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['1A_0',\n", " '1B_0',\n", " '1C_0',\n", " '1D_0',\n", " '2E_0',\n", " '2F_0',\n", " '2G_0',\n", " '2H_0',\n", " '3I_0',\n", " '3J_0',\n", " '3K_0',\n", " '3L_0']" ] }, "execution_count": 229, "metadata": {}, "output_type": "execute_result" } ], "source": [ "snames" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">1A_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">1B_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">1C_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">1D_0_7 b'TGCAGCTTCTGAAGCCTTAAATCCTCACGTCAACATGATGCCTWCATGAA'\n", ">2E_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">2F_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">2G_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">2H_0_7 b'TGCAGCTTCTGAAGCCTGCAATCCTCACGTCAATATGATGCCTACATGAA'\n", ">3I_0_7 b'TGCAGCTYCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">3J_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">3K_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">3L_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n" ] } ], "source": [ "widx = np.argsort([i.rsplit(\";\", 1)[0] for i in keys])\n", "for i in widx:\n", " print(names[i], b\"\".join(seqarr[i]))" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2, 3, 4, 7])" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "consensus = varmat[:, 0]\n", "variants = np.nonzero(varmat[:, 1])[0]\n", "variants" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, 
True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " False, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, False, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, False,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, False, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, False, True,\n", " True, False, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, False, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, False, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, False, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True]])" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": 
[ "seqarr == seqarr[0]" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "for kidx in range(seqarr.shape[0]):\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'A', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'W', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'G', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'T', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'Y', 'C', 'T', 'T', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 
'C', 'T', 'T', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A']],\n", " dtype='<U1')" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = \"\".join(align1)[1:].split(\"\\n>\")\n", "dalign1 = dict([i.split(\"\\n\", 1) for i in lines])\n", "keys = sorted(\n", "    dalign1.keys(), \n", "    key=lambda x: int(x.rsplit(\"*\")[-1])\n", ")\n", "seqarr = np.zeros(\n", "    (len(nnames), len(dalign1[keys[0]])),\n", "    dtype='U1',\n", "    )\n", "for kidx, key in enumerate(keys):\n", "    concatseq = dalign1[key].replace(\"\\n\", '')\n", "    seqarr[kidx] = list(dalign1[key].replace(\"\\n\", \"\"))\n", "\n", "seqarr" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fill in edges for each seq\n", "edg = (\n", "    len(concatseq) - len(concatseq.lstrip('-')),\n", "    len(concatseq.rstrip('-')),\n", "    len(concatseq.rstrip('-')),\n", "    len(concatseq.rstrip('-')),\n", "    ) \n", "edges[ldx, kidx] = edg\n", "\n", "edges[ldx, kidx]\n", "\n", "concatseq[0:50] + 'nnnn'[50:50] + concatseq[50:50]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "    seeds = [\n", "        os.path.join(\n", "            data.dirs.across, \n", "            \"{}-{}.htemp\".format(data.name, jobid)) for jobid in jobids\n", "    ]\n", "    allseeds = os.path.join(data.dirs.across, \"{}-x.fa\".format(data.name))\n", "\n", "    cmd1 = ['cat'] + seeds\n", "    cmd2 = [ipyrad.bins.vsearch, '--sortbylength', '-', '--output', allseeds]\n", "    proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)\n", "    proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=sps.PIPE, close_fds=True)\n", "    out = proc2.communicate()\n", "    proc1.stdout.close()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(b'', None)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "out" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "data = s.data\n", "jobid = 2\n", "nthreads = 4" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "    # get files for this jobid\n", "    catshuf = os.path.join(\n", "        data.dirs.across, \n", "        \"{}-{}-catshuf.fa\".format(data.name, jobid))\n", "    uhaplos = os.path.join(\n", "        data.dirs.across, \n", "        \"{}-{}.utemp\".format(data.name, jobid))\n", "    hhaplos = os.path.join(\n", "        data.dirs.across, \n", "        \"{}-{}.htemp\".format(data.name, jobid))\n", "    #logfile = os.path.join(data.dirs.across, \"s6_cluster_stats.txt\")\n", "\n", "    ## parameters that vary by datatype\n", "    ## (too low of cov values yield too many poor alignments)\n", "    strand = \"plus\"\n", "    cov = 0.75  # 0.90\n", "    if data.paramsdict[\"datatype\"] in [\"gbs\", \"2brad\"]:\n", "        strand = \"both\"\n", "        cov = 0.60\n", "    elif data.paramsdict[\"datatype\"] == \"pairgbs\":\n", "        strand = \"both\"\n", "        cov = 0.75  # 0.90\n", "\n", "    ## nthreads is calculated in 'call_cluster()'\n", "    cmd = [ipyrad.bins.vsearch,\n", "           \"-cluster_smallmem\", catshuf,\n", "           \"-strand\", strand,\n", "           \"-query_cov\", str(cov),\n", "           \"-minsl\", str(0.5),\n", "           \"-id\", str(data.paramsdict[\"clust_threshold\"]),\n", "           \"-userout\", uhaplos,\n", "           \"-notmatched\", hhaplos,\n", "           \"-userfields\", \"query+target+qstrand\",\n", "           \"-maxaccepts\", \"1\",\n", "           
\"-maxrejects\", \"0\",\n", " \"-fasta_width\", \"0\",\n", " \"-threads\", str(nthreads), # \"0\",\n", " \"-fulldp\",\n", " \"-usersort\",\n", " ]\n", " proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)\n", " out = proc.communicate()\n", " if proc.returncode:\n", " raise IPyradError(out)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)\n", "out = proc.communicate()\n", "if proc.returncode:\n", " raise IPyradError(out)\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "build_concat_files(s.data, 1, [s.data.samples[i] for i in s.cgroups[1]], 123)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "data = s.data\n", "jobid = 1\n", "samples = [s.data.samples[i] for i in s.cgroups[1]]\n", "randomseed = 123" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "conshandles = [\n", " sample.files.consens[0] for sample in samples if \n", " sample.stats.reads_consens]\n", "conshandles.sort()\n", "assert conshandles, \"no consensus files found\"\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(None, None)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ " ## concatenate all of the gzipped consens files\n", " cmd = ['cat'] + conshandles\n", " groupcons = os.path.join(\n", " data.dirs.across, \n", " \"{}-{}-catcons.gz\".format(data.name, jobid))\n", " LOGGER.debug(\" \".join(cmd))\n", " with open(groupcons, 'w') as output:\n", " call = sps.Popen(cmd, stdout=output, close_fds=True)\n", " call.communicate()\n", "\n", " ## a string of sed substitutions for temporarily replacing hetero sites\n", " ## skips lines with '>', so it doesn't affect taxon names\n", " subs = [\"/>/!s/W/A/g\", \"/>/!s/w/A/g\", \"/>/!s/R/A/g\", \"/>/!s/r/A/g\",\n", " \"/>/!s/M/A/g\", \"/>/!s/m/A/g\", \"/>/!s/K/T/g\", \"/>/!s/k/T/g\",\n", " \"/>/!s/S/C/g\", \"/>/!s/s/C/g\", \"/>/!s/Y/C/g\", \"/>/!s/y/C/g\"]\n", " subs = \";\".join(subs)\n", "\n", " ## impute pseudo-haplo information to avoid mismatch at hetero sites\n", " ## the read data with hetero sites is put back into clustered data later.\n", " ## pipe passed data from gunzip to sed.\n", " cmd1 = [\"gunzip\", \"-c\", groupcons]\n", " cmd2 = [\"sed\", subs]\n", " LOGGER.debug(\" \".join(cmd1))\n", " LOGGER.debug(\" \".join(cmd2))\n", "\n", " proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)\n", " allhaps = groupcons.replace(\"-catcons.gz\", \"-cathaps.fa\")\n", " with open(allhaps, 'w') as output:\n", " proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)\n", " proc2.communicate()\n", " proc1.stdout.close()\n", "\n", " ## now sort the file using vsearch\n", " allsort = groupcons.replace(\"-catcons.gz\", \"-catsort.fa\")\n", " cmd1 = [ipyrad.bins.vsearch,\n", " \"--sortbylength\", allhaps,\n", " \"--fasta_width\", \"0\",\n", " \"--output\", allsort]\n", " LOGGER.debug(\" \".join(cmd1))\n", " proc1 = sps.Popen(cmd1, close_fds=True)\n", " proc1.communicate()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dirs.across" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# calculate the theading of 
cluster1 jobs:\n", "ncpus = len(self.ipyclient.ids)\n", "njobs = len(self.cgroups)\n", "nnodes = len(self.hostd)\n", "minthreads = 2\n", "maxthreads = 10" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# product\n", "targets = [0, 1]\n", "self.thview = self.ipyclient.load_balanced_view(targets=targets)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:02 | concatenating inputs | s6 |\n" ] } ], "source": [ "s.prepare_concats()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "ename": "IPyradError", "evalue": "(b'vsearch v2.8.0_linux_x86_64, 15.5GB RAM, 4 cores\\nhttps://github.com/torognes/vsearch\\n\\n\\n\\nUnable to open file for reading (/home/deren/Documents/ipyrad/tests/4-refpairtest_across/4-refpairtest-2-catshuf.fa)\\n', None)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcluster1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36mcluster1\u001b[0;34m(data, jobid, nthreads)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIPyradError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m: (b'vsearch v2.8.0_linux_x86_64, 15.5GB RAM, 4 cores\\nhttps://github.com/torognes/vsearch\\n\\n\\n\\nUnable to open file for reading (/home/deren/Documents/ipyrad/tests/4-refpairtest_across/4-refpairtest-2-catshuf.fa)\\n', None)" ] } ], "source": [ "cluster1(data, 2, 4)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "def cluster1(data, jobid, nthreads):\n", "\n", " # get files for this jobid\n", " catshuf = os.path.join(\n", " data.dirs.across, \n", " \"{}-{}-catshuf.fa\".format(data.name, jobid))\n", " uhaplos = os.path.join(\n", " data.dirs.across, \n", " \"{}-{}.utemp\".format(data.name, jobid))\n", " hhaplos = os.path.join(\n", " data.dirs.across, \n", " \"{}-{}.htemp\".format(data.name, jobid))\n", "\n", " ## parameters that vary by datatype\n", " ## (too low of cov values yield too many poor alignments)\n", " strand = \"plus\"\n", " cov = 0.75 # 0.90\n", " if data.paramsdict[\"datatype\"] in [\"gbs\", \"2brad\"]:\n", " strand = \"both\"\n", " cov = 0.60\n", " elif data.paramsdict[\"datatype\"] == \"pairgbs\":\n", " strand = \"both\"\n", " cov = 0.75 # 0.90\n", "\n", " ## nthreads is calculated in 'call_cluster()'\n", " cmd = [ipyrad.bins.vsearch,\n", " \"-cluster_smallmem\", catshuf,\n", " \"-strand\", strand,\n", " \"-query_cov\", str(cov),\n", " \"-minsl\", str(0.5),\n", 
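"           # NB (added annotation): -id takes the assembly's clust_threshold\n", "           # as vsearch's pairwise-identity cutoff, and -minsl 0.5 rejects\n", "           # hits whose shorter/longer sequence length ratio is below 50%.\n",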
" \"-id\", str(data.paramsdict[\"clust_threshold\"]),\n", " \"-userout\", uhaplos,\n", " \"-notmatched\", hhaplos,\n", " \"-userfields\", \"query+target+qstrand\",\n", " \"-maxaccepts\", \"1\",\n", " \"-maxrejects\", \"0\",\n", " \"-fasta_width\", \"0\",\n", " \"-threads\", str(nthreads), # \"0\",\n", " \"-fulldp\",\n", " \"-usersort\",\n", " ]\n", " proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)\n", " out = proc.communicate()\n", " if proc.returncode:\n", " raise IPyradError(out)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "if len(self.samples) < 50:\n", " nclusters = 1\n", "elif len(self.samples) < 100:\n", " nclusters = 2\n", "elif len(self.samples) < 200:\n", " nclusters = 4\n", "elif len(self.samples) > 500:\n", " nclusters = 10\n", "else:\n", " nclusters = int(len(self.samples) / 10)\n", " \n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: ['1A_0', '1B_0', '1C_0', '1D_0'],\n", " 1: ['2E_0', '2F_0', '2G_0', '2H_0'],\n", " 2: ['3I_0', '3J_0', '3K_0', '3L_0']}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "self.cgroups" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'oud': [0, 1, 2, 3]}" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hosts\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }