{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.clustmap import *\n", "from ipyrad.assemble.cluster_across import *\n", "import ipyparallel as ipp" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New Assembly: 5-cyatho\n" ] } ], "source": [ "data = ip.Assembly(\"5-cyatho\")\n", "data.set_params(\"project_dir\", \"5-cyatho\")\n", "data.set_params(\"sorted_fastq_path\", \"./example_empirical_rad/*.gz\")\n", "data.set_params(\"filter_adapters\", 2)\n", "data.set_params(\"clust_threshold\", .90)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: 5-cyatho\n", "[####################] 100% 0:00:12 | loading reads | s1 |\n", "[####################] 100% 0:04:36 | processing reads | s2 |\n" ] } ], "source": [ "data.run(\"12\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: \n", " Controller appears to be listening on localhost, but not on this machine.\n", " If this is true, you should specify Client(...,sshserver='you@oud')\n", " or instruct your controller to listen on an external IP.\n", " RuntimeWarning)\n" ] }, { "ename": "NameError", "evalue": "name 'data' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mipyrad\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massemble\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclustmap\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mipyclient\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mipp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mClient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0ms3\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStep3\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mipyclient\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0msamples\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0msample\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msamples\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'data' is not defined" ] } 
], "source": [ "from ipyrad.assemble.clustmap import *\n", "ipyclient = ipp.Client()\n", "s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)\n", "samples = list(s3.data.samples.values())\n", "sample = samples[1]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:00:30 | dereplicating | s3 |\n", "[####################] 100% 0:18:59 | clustering/mapping | s3 |\n", "[####################] 100% 0:00:18 | building clusters | s3 |\n", "[####################] 100% 0:00:01 | chunking clusters | s3 |\n", "[####################] 100% 0:19:32 | aligning clusters | s3 |\n", "[####################] 100% 0:00:39 | concat clusters | s3 |\n" ] } ], "source": [ "s3.run()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: 5-cyatho\n", "[####################] 100% 0:02:44 | inferring [H, E] | s4 |\n", "[####################] 100% 0:00:06 | calculating depths | s5 |\n", "[####################] 100% 0:00:13 | chunking clusters | s5 |\n", "[####################] 100% 0:11:47 | consens calling | s5 |\n" ] } ], "source": [ "data.run(\"45\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: 5-cyatho\n", "from saved path: ~/Documents/ipyrad/tests/5-cyatho/5-cyatho.json\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: \n", " Controller appears to be listening on localhost, but not on this machine.\n", " If this is true, you should specify Client(...,sshserver='you@oud')\n", " or instruct your controller to listen on an external IP.\n", " RuntimeWarning)\n" ] } ], "source": [ "data = ip.load_json(\"5-cyatho/5-cyatho.json\")\n", "samples = list(data.samples.values())\n", "ipyclient = ipp.Client()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'o': (0, ['33588_przewalskii', '32082_przewalskii']),\n", " 'c': (3,\n", " ['29154_superba',\n", " '30686_cyathophylla',\n", " '41478_cyathophylloides',\n", " '41954_cyathophylloides']),\n", " 'r': (3,\n", " ['35236_rex',\n", " '35855_rex',\n", " '38362_rex',\n", " '39618_rex',\n", " '40578_rex',\n", " '30556_thamno',\n", " '33413_thamno'])}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from ipyrad.assemble.cluster_across import *\n", "popmins = {'o': 0, 'c': 3, 'r': 3}\n", "popdict = {\n", " 'o': ['33588_przewalskii', '32082_przewalskii'],\n", " 'c': ['29154_superba', '30686_cyathophylla', '41478_cyathophylloides', '41954_cyathophylloides'],\n", " 'r': [ '35236_rex', '35855_rex', '38362_rex', '39618_rex', '40578_rex', '30556_thamno', '33413_thamno'],\n", "}\n", "data._link_populations(popdict, popmins)\n", "data.populations" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "s6 = Step6(data, samples, ipyclient, 0, 0)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:05 | concatenating inputs | s6 |\n" ] } ], "source": [ "s6.remote_build_concats_tier1()" ] }, { "cell_type": "code", "execution_count": 8, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:01:43 | clustering across 1 | s6 |\n" ] } ], "source": [ "s6.remote_cluster1()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:01 | concatenating inputs | s6 |\n" ] } ], "source": [ "s6.remote_build_concats_tier2()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:02:27 | clustering 2 | s6 |\n" ] } ], "source": [ "s6.remote_cluster2()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:04 | building clusters | s6 |\n" ] } ], "source": [ "s6.remote_build_denovo_clusters()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:11 | aligning clusters | s6 |\n" ] } ], "source": [ "s6.remote_align_denovo_clusters()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "could not broadcast input array from shape (104) into shape (103)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign_to_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms6\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms6\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36malign_to_array\u001b[0;34m(data, samples, chunk)\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0mvarmat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrefbuild\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseqarr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'u1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPSEUDO_REF\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0mconstmp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvarmat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'S1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m \u001b[0mconsens\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mldx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mconstmp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconstmp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 89\u001b[0m \u001b[0mvarstmp\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnonzero\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvarmat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0mvarpos\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mldx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mvarstmp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvarstmp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: could not broadcast input array from shape (104) into shape (103)" ] } ], "source": [ "chunk = \"./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760\"\n", "align_to_array(s6.data, s6.samples, chunk)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def align_to_array(data, samples, chunk):\n", "\n", " # data are already chunked, read in the whole thing\n", " with open(chunk, 'rt') as infile:\n", " clusts = infile.read().split(\"//\\n//\\n\")[:-1] \n", "\n", " # snames to ensure sorted order\n", " samples.sort(key=lambda x: x.name)\n", " snames = [sample.name for sample in samples]\n", "\n", " # make a tmparr to store metadata (this can get huge, consider using h5)\n", " maxvar = 25\n", " maxlen = data._hackersonly[\"max_fragment_length\"] + 20\n", " maxinds = data.paramsdict[\"max_Indels_locus\"]\n", " ifilter = np.zeros(len(clusts), dtype=np.bool_)\n", " dfilter = np.zeros(len(clusts), dtype=np.bool_)\n", " varpos = np.zeros((len(clusts), maxvar), dtype='u1')\n", " indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')\n", " edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')\n", " consens = np.zeros((len(clusts), maxlen), dtype=\"S1\")\n", "\n", " # create a persistent shell for running muscle in. 
\n", " proc = sps.Popen([\"bash\"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)\n", "\n", " # iterate over clusters until finished\n", " allstack = []\n", " for ldx in range(len(clusts)):\n", " istack = []\n", " lines = clusts[ldx].strip().split(\"\\n\")\n", " names = lines[::2]\n", " seqs = lines[1::2]\n", " seqs = [i[:90] for i in seqs] \n", " align1 = []\n", " \n", " # find duplicates and skip aligning but keep it for downstream.\n", " unames = set([i.rsplit(\"_\", 1)[0] for i in names])\n", " if len(unames) < len(names):\n", " dfilter[ldx] = True\n", " istack = [\"{}\\n{}\".format(i[1:], j) for i, j in zip(names, seqs)]\n", "\n", " else:\n", " # append counter to names because muscle doesn't retain order\n", " nnames = [\">{};*{}\".format(j[1:], i) for i, j in enumerate(names)]\n", "\n", " # make back into strings\n", " cl1 = \"\\n\".join([\"\\n\".join(i) for i in zip(nnames, seqs)]) \n", "\n", " # store allele (lowercase) info\n", " amask, abool = store_alleles(seqs)\n", "\n", " # send align1 to the bash shell (TODO: check for pipe-overflow)\n", " cmd1 = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n", " .format(cl1, ipyrad.bins.muscle, \"//\\n\"))\n", " proc.stdin.write(cmd1.encode())\n", "\n", " # read the stdout by line until splitter is reached\n", " for line in iter(proc.stdout.readline, b'//\\n'):\n", " align1.append(line.decode())\n", "\n", " # reorder b/c muscle doesn't keep order\n", " lines = \"\".join(align1)[1:].split(\"\\n>\")\n", " dalign1 = dict([i.split(\"\\n\", 1) for i in lines])\n", " keys = sorted(\n", " dalign1.keys(), \n", " key=lambda x: int(x.rsplit(\"*\")[-1])\n", " )\n", " seqarr = np.zeros(\n", " (len(nnames), len(dalign1[keys[0]].replace(\"\\n\", \"\"))),\n", " dtype='S1',\n", " )\n", " for kidx, key in enumerate(keys):\n", " concatseq = dalign1[key].replace(\"\\n\", \"\")\n", " seqarr[kidx] = list(concatseq)\n", " \n", " # fill in edges for each seq\n", " edg = (\n", " len(concatseq) - len(concatseq.lstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " )\n", " sidx = snames.index(key.rsplit('_', 1)[0])\n", " edges[ldx, sidx] = edg\n", "\n", " # get ref/variant counts in an array\n", " varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)\n", " constmp = varmat[:, 0].view('S1')\n", " consens[ldx, :constmp.size] = constmp\n", " varstmp = np.nonzero(varmat[:, 1])[0]\n", " varpos[ldx, :varstmp.size] = varstmp \n", "\n", " # impute allele information back into the read b/c muscle\n", " wkeys = np.argsort([i.rsplit(\"_\", 1)[0] for i in keys])\n", "\n", " # sort in sname (alphanumeric) order for indels storage\n", " for idx in wkeys:\n", " # seqarr and amask are in input order here\n", " args = (seqarr, idx, amask)\n", " seqarr[idx], indidx = retrieve_indels_and_alleles(*args)\n", " sidx = snames.index(names[idx].rsplit(\"_\", 1)[0][1:])\n", " if indidx.size:\n", " indsize = min(indidx.size, sum(maxinds))\n", " indels[ldx, sidx, :indsize] = indidx[:indsize]\n", " wname = names[idx]\n", "\n", " # store (only for visually checking alignments)\n", " istack.append(\n", " \"{}\\n{}\".format(wname, b\"\".join(seqarr[idx]).decode()))\n", "\n", " # indel filter\n", " if indels.sum(axis=1).max() >= sum(maxinds):\n", " ifilter[ldx] = True\n", "\n", " # store the stack (only for visually checking alignments)\n", " if istack:\n", " allstack.append(\"\\n\".join(istack))\n", "\n", " # cleanup\n", " proc.stdout.close()\n", " if proc.stderr:\n", " proc.stderr.close()\n", " proc.stdin.close()\n", " proc.wait()\n", 
"\n", " # write to file after (only for visually checking alignments)\n", " odx = chunk.rsplit(\"_\")[-1]\n", " alignfile = os.path.join(data.tmpdir, \"aligned_{}.fa\".format(odx))\n", " with open(alignfile, 'wt') as outfile:\n", " outfile.write(\"\\n//\\n//\\n\".join(allstack) + \"\\n\")\n", " #os.remove(chunk)\n", "\n", " # save indels array to tmp dir\n", " np.save(\n", " os.path.join(data.tmpdir, \"ifilter_{}.tmp.npy\".format(odx)),\n", " ifilter)\n", " np.save(\n", " os.path.join(data.tmpdir, \"dfilter_{}.tmp.npy\".format(odx)),\n", " dfilter)\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: 4-refpairtest\n", "from saved path: ~/Documents/ipyrad/tests/4-refpairtest.json\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: \n", " Controller appears to be listening on localhost, but not on this machine.\n", " If this is true, you should specify Client(...,sshserver='you@oud')\n", " or instruct your controller to listen on an external IP.\n", " RuntimeWarning)\n" ] } ], "source": [ "data = ip.load_json(\"4-refpairtest.json\")\n", "samples = list(data.samples.values())\n", "sample = samples[0]\n", "force = True\n", "ipyclient = ipp.Client()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'3J_0': ,\n", " '1D_0': ,\n", " '2H_0': ,\n", " '1B_0': ,\n", " '2F_0': ,\n", " '3L_0': ,\n", " '2E_0': ,\n", " '3I_0': ,\n", " '3K_0': ,\n", " '2G_0': ,\n", " '1A_0': ,\n", " '1C_0': }" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.samples" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'1': (3, ['1A_0', '1B_0', '1C_0', '1D_0']),\n", " '2': (3, ['2E_0', '2F_0', '2G_0', '2H_0']),\n", " '3': (3, ['3I_0', '3J_0', '3K_0', '3L_0'])}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "popmins = {'1': 3, '2': 3, '3': 3}\n", "popdict = {\n", " '1': ['1A_0', '1B_0', '1C_0', '1D_0'], \n", " '2': ['2E_0', '2F_0', '2G_0', '2H_0'],\n", " '3': ['3I_0', '3J_0', '3K_0', '3L_0'],\n", "}\n", "\n", "data._link_populations(popdict, popmins)\n", "data.populations" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = Step6(data, samples, ipyclient, 123, True)\n", "s.samples" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | concatenating inputs | s6 |\n" ] } ], "source": [ "s.remote_build_concats_tier1()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:02 | clustering across 1 | s6 |\n" ] } ], "source": [ "s.remote_cluster1()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | concatenating inputs | s6 |\n" ] } ], "source": [ "s.remote_build_concats_tier2()\n" ] }, { "cell_type": "code", 
"execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | clustering 2 | s6 |\n" ] } ], "source": [ "s.remote_cluster2()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | building clusters | s6 |\n" ] } ], "source": [ "s.remote_build_denovo_clusters()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "chunks = glob.glob('4-refpairtest_across/4-refpairtest-tmpalign/*.chunk*')\n", "for chunk in chunks:\n", " align_to_array(s.data, s.samples, chunk)" ] }, { "cell_type": "code", "execution_count": 317, "metadata": {}, "outputs": [], "source": [ "chunk = \"4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_70\"\n", "samples = s.samples\n", "data = s.data\n", "\n", "# data are already chunked, read in the whole thing\n", "#with open(chunk, 'rt') as infile:\n", "# clusts = infile.read().split(\"//\\n//\\n\")[:-1] " ] }, { "cell_type": "code", "execution_count": 389, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['>1A_0_7;*0',\n", " '>1B_0_7;*1',\n", " '>1D_0_7;*2',\n", " '>1C_0_7;*3',\n", " '>2E_0_7;*4',\n", " '>2F_0_7;*5',\n", " '>2H_0_7;*6',\n", " '>2G_0_7;*7',\n", " '>3I_0_7;*8',\n", " '>3J_0_7;*9',\n", " '>3L_0_7;*10',\n", " '>3K_0_7;*11']" ] }, "execution_count": 389, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nnames" ] }, { "cell_type": "code", "execution_count": 451, "metadata": {}, "outputs": [], "source": [ "def align_to_array(data, samples, chunk):\n", "\n", " # data are already chunked, read in the whole thing\n", " with open(chunk, 'rt') as infile:\n", " clusts = infile.read().split(\"//\\n//\\n\")[:-1] \n", "\n", " # snames to ensure sorted order\n", " samples.sort(key=lambda x: x.name)\n", " snames = [sample.name for sample in samples]\n", "\n", " # make a tmparr to store metadata (this can get huge, consider using h5)\n", " maxvar = 25\n", " maxlen = data._hackersonly[\"max_fragment_length\"] + 20\n", " maxinds = data.paramsdict[\"max_Indels_locus\"]\n", " ifilter = np.zeros(len(clusts), dtype=np.bool_)\n", " dfilter = np.zeros(len(clusts), dtype=np.bool_)\n", " varpos = np.zeros((len(clusts), maxvar), dtype='u1')\n", " indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')\n", " edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')\n", " consens = np.zeros((len(clusts), maxlen), dtype=\"S1\")\n", "\n", " # create a persistent shell for running muscle in. 
\n", " proc = sps.Popen([\"bash\"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)\n", "\n", " # iterate over clusters until finished\n", " allstack = []\n", " for ldx in range(len(clusts)):\n", " istack = []\n", " lines = clusts[ldx].strip().split(\"\\n\")\n", " names = lines[::2]\n", " seqs = lines[1::2] \n", " align1 = []\n", " \n", " # find duplicates and skip aligning/ordering since it will be filtered.\n", " unames = set([i.rsplit(\"_\", 1)[0] for i in names])\n", " if len(unames) < len(names):\n", " dfilter[ldx] = True\n", " istack = [\"{}\\n{}\".format(i[1:], j) for i, j in zip(names, seqs)]\n", "\n", " else:\n", " # append counter to names because muscle doesn't retain order\n", " nnames = [\">{};*{}\".format(j[1:], i) for i, j in enumerate(names)]\n", "\n", " # make back into strings\n", " cl1 = \"\\n\".join([\"\\n\".join(i) for i in zip(nnames, seqs)]) \n", "\n", " # store allele (lowercase) info on seqs\n", " amask, abool = store_alleles(seqs)\n", "\n", " # send align1 to the bash shell (TODO: check for pipe-overflow)\n", " cmd1 = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n", " .format(cl1, ipyrad.bins.muscle, \"//\\n\"))\n", " proc.stdin.write(cmd1.encode())\n", "\n", " # read the stdout by line until splitter is reached\n", " for line in iter(proc.stdout.readline, b'//\\n'):\n", " align1.append(line.decode())\n", "\n", " # reorder into original order and arrange into a dictionary and arr\n", " lines = \"\".join(align1)[1:].split(\"\\n>\")\n", " dalign1 = dict([i.split(\"\\n\", 1) for i in lines])\n", " keys = sorted(\n", " dalign1.keys(), \n", " key=lambda x: int(x.rsplit(\"*\")[-1])\n", " )\n", " seqarr = np.zeros(\n", " (len(nnames), len(dalign1[keys[0]].replace(\"\\n\", \"\"))),\n", " dtype='S1',\n", " )\n", " for kidx, key in enumerate(keys):\n", " concatseq = dalign1[key].replace(\"\\n\", \"\")\n", " seqarr[kidx] = list(concatseq)\n", " \n", " # fill in edges for each seq\n", " edg = (\n", " len(concatseq) - len(concatseq.lstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " len(concatseq.rstrip('-')),\n", " )\n", " sidx = snames.index(key.rsplit('_', 1)[0])\n", " edges[ldx, sidx] = edg\n", "\n", " # get ref/variant counts in an array\n", " varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)\n", " constmp = varmat[:, 0].view(\"S1\")\n", " consens[ldx, :constmp.size] = constmp\n", " varstmp = np.nonzero(varmat[:, 1])[0]\n", " varpos[ldx, :varstmp.size] = varstmp \n", "\n", " # impute allele information back into the read b/c muscle\n", " wkeys = np.argsort([i.rsplit(\"_\", 1)[0] for i in keys])\n", "\n", " # sort in sname (alphanumeric) order for indels storage\n", " for idx in wkeys:\n", " # seqarr and amask are in input order here\n", " args = (seqarr, idx, amask)\n", " seqarr[idx], indidx = retrieve_indels_and_alleles(*args)\n", " sidx = snames.index(names[idx].rsplit(\"_\", 1)[0][1:])\n", " if indidx.size:\n", " indels[sidx, :indidx.size] = indidx\n", " wname = names[idx]\n", " istack.append(\n", " \"{}\\n{}\".format(wname, b\"\".join(seqarr[idx]).decode()))\n", "\n", " # store the stack\n", " if istack:\n", " allstack.append(\"\\n\".join(istack))\n", "\n", " # cleanup\n", " proc.stdout.close()\n", " if proc.stderr:\n", " proc.stderr.close()\n", " proc.stdin.close()\n", " proc.wait()\n", "\n", " # write to file after\n", " odx = chunk.rsplit(\"_\")[-1]\n", " alignfile = os.path.join(data.tmpdir, \"align_{}.fa\".format(odx))\n", " with open(alignfile, 'wt') as outfile:\n", " outfile.write(\"\\n//\\n//\\n\".join(allstack) + \"\\n\")\n", " 
#os.remove(chunk)\n", "\n", " # save indels array to tmp dir\n", " ifile = os.path.join(data.tmpdir, \"indels_{}.tmp.npy\".format(odx))\n", " np.save(ifile, ifilter)\n", " dfile = os.path.join(data.tmpdir, \"duples_{}.tmp.npy\".format(odx))\n", " np.save(dfile, dfilter)" ] }, { "cell_type": "code", "execution_count": 456, "metadata": {}, "outputs": [], "source": [ "def store_alleles(seqs):\n", " shape = (len(seqs), max([len(i) for i in seqs]))\n", " arrseqs = np.zeros(shape, dtype=np.bytes_)\n", " for row in range(arrseqs.shape[0]):\n", " seqsrow = seqs[row]\n", " arrseqs[row, :len(seqsrow)] = list(seqsrow)\n", " amask = np.char.islower(arrseqs)\n", " if np.any(amask):\n", " return amask, True\n", " else:\n", " return amask, False\n", "\n", "\n", "\n", "def retrieve_indels_and_alleles(seqarr, idx, amask):\n", " concatarr = seqarr[idx]\n", " newmask = np.zeros(len(concatarr), dtype=np.bool_) \n", "\n", " # check for indels and impute to amask\n", " indidx = np.where(concatarr == b\"-\")[0]\n", " if np.sum(amask):\n", " print('yes')\n", " if indidx.size:\n", "\n", " # impute for position of variants\n", " allrows = np.arange(amask.shape[1])\n", " mask = np.ones(allrows.shape[0], dtype=np.bool_)\n", " for idx in indidx:\n", " if idx < mask.shape[0]:\n", " mask[idx] = False\n", " not_idx = allrows[mask == 1]\n", "\n", " # fill in new data into all other spots\n", " newmask[not_idx] = amask[idx, :not_idx.shape[0]]\n", "\n", " else:\n", " newmask = amask[idx]\n", " \n", " # lower the alleles\n", " concatarr[newmask] = np.char.lower(concatarr[newmask])\n", "\n", " return concatarr, indidx" ] }, { "cell_type": "code", "execution_count": 462, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, False, False, False, False, False, False, False, False,\n", " False, False, False, False, False, False, False, False, False,\n", " False, False, False, False, False, False, False, False, False,\n", " False, False, False, False, False, False, False, False])" ] }, "execution_count": 462, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.load(\"4-refpairtest_across/4-refpairtest-tmpalign/indels_35.tmp.npy\")" ] }, { "cell_type": "code", "execution_count": 457, "metadata": {}, "outputs": [], "source": [ "align_to_array(data, samples, chunk)" ] }, { "cell_type": "code", "execution_count": 459, "metadata": {}, "outputs": [], "source": [ "chunks = glob.glob(\"4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_*\")\n", "for chunk in chunks:\n", " align_to_array(data, samples, chunk)" ] }, { "cell_type": "code", "execution_count": 263, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "@numba.jit(nopython=True)\n", "def refbuild(iseq, consdict):\n", " \"\"\" Returns the most common base at each site in order. 
\"\"\"\n", "\n", " altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)\n", "\n", " for col in range(iseq.shape[1]):\n", " ## expand colums with ambigs and remove N-\n", " fcounts = np.zeros(111, dtype=np.int64)\n", " counts = np.bincount(iseq[:, col])#, minlength=90)\n", " fcounts[:counts.shape[0]] = counts\n", " ## set N and - to zero, wish numba supported minlen arg\n", " fcounts[78] = 0\n", " fcounts[45] = 0\n", " ## add ambig counts to true bases\n", " for aidx in range(consdict.shape[0]):\n", " nbases = fcounts[consdict[aidx, 0]]\n", " for _ in range(nbases):\n", " fcounts[consdict[aidx, 1]] += 1\n", " fcounts[consdict[aidx, 2]] += 1\n", " fcounts[consdict[aidx, 0]] = 0\n", "\n", " ## now get counts from the modified counts arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 0] = who\n", " fcounts[who] = 0\n", "\n", " ## if an alt allele fill over the \".\" placeholder\n", " who = np.argmax(fcounts)\n", " if who:\n", " altrefs[col, 1] = who\n", " fcounts[who] = 0\n", "\n", " ## if 3rd or 4th alleles observed then add to arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 2] = who\n", " fcounts[who] = 0\n", "\n", " ## if 3rd or 4th alleles observed then add to arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 3] = who\n", "\n", " return altrefs\n", "\n", "\n", "PSEUDO_REF = np.array([\n", " [82, 71, 65],\n", " [75, 71, 84],\n", " [83, 71, 67],\n", " [89, 84, 67],\n", " [87, 84, 65],\n", " [77, 67, 65],\n", " [78, 78, 78],\n", " [45, 45, 45],\n", " ], dtype=np.uint8)\n", "\n" ] }, { "cell_type": "code", "execution_count": 274, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[84, 71, 67, ..., 67, 67, 71],\n", " [84, 71, 67, ..., 67, 67, 71],\n", " [84, 71, 67, ..., 67, 67, 71],\n", " ...,\n", " [84, 71, 67, ..., 67, 67, 71],\n", " [84, 71, 67, ..., 67, 67, 71],\n", " [84, 71, 67, ..., 67, 67, 71]], dtype=uint8)" ] }, "execution_count": 274, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seqarr.view(\"u1\")" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]),\n", " array([3, 4, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7]))" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aseqs = np.array([\n", " list('ATG--GGA'),\n", " list('A--GGGGA'),\n", " list('---GGGGA'),\n", " list('--------'),\n", "])\n", "\n", "np.where(aseqs == \"-\")" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [], "source": [ "PSEUDO_REF = np.array([\n", " [82, 71, 65],\n", " [75, 71, 84],\n", " [83, 71, 67],\n", " [89, 84, 67],\n", " [87, 84, 65],\n", " [77, 67, 65],\n", " [78, 78, 78],\n", " [45, 45, 45],\n", " ], dtype=np.uint8)\n" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "@numba.jit(nopython=True)\n", "def reftrick(iseq, consdict):\n", " \"\"\" Returns the most common base at each site in order. 
\"\"\"\n", "\n", " altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)\n", " #altrefs[:, 1] = 46\n", "\n", " for col in range(iseq.shape[1]):\n", " ## expand colums with ambigs and remove N-\n", " fcounts = np.zeros(111, dtype=np.int64)\n", " counts = np.bincount(iseq[:, col])#, minlength=90)\n", " fcounts[:counts.shape[0]] = counts\n", " ## set N and - to zero, wish numba supported minlen arg\n", " fcounts[78] = 0\n", " fcounts[45] = 0\n", " ## add ambig counts to true bases\n", " for aidx in range(consdict.shape[0]):\n", " nbases = fcounts[consdict[aidx, 0]]\n", " for _ in range(nbases):\n", " fcounts[consdict[aidx, 1]] += 1\n", " fcounts[consdict[aidx, 2]] += 1\n", " fcounts[consdict[aidx, 0]] = 0\n", "\n", " ## now get counts from the modified counts arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 0] = who\n", " fcounts[who] = 0\n", "\n", " ## if an alt allele fill over the \".\" placeholder\n", " who = np.argmax(fcounts)\n", " if who:\n", " altrefs[col, 1] = who\n", " fcounts[who] = 0\n", "\n", " ## if 3rd or 4th alleles observed then add to arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 2] = who\n", " fcounts[who] = 0\n", "\n", " ## if 3rd or 4th alleles observed then add to arr\n", " who = np.argmax(fcounts)\n", " altrefs[col, 3] = who\n", "\n", " return altrefs" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[b'-', b'-', b'-', b'A', b'A', b'A', b'A', b'-', b'-', b'-', b'-'],\n", " [b'-', b'-', b'-', b'T', b'T', b'A', b'A', b'-', b'-', b'-', b'-'],\n", " [b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A'],\n", " [b'A', b'A', b'T', b'A', b'A', b'A', b'A', b'T', b'A', b'A', b'A']],\n", " dtype='|S1')" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sss = np.array([\n", " list(\"---AAAA----\"),\n", " list(\"---TTAA----\"),\n", " list(\"AAAAAAAAAAA\"),\n", " list(\"AATAAAATAAA\"),\n", "], dtype=\"S1\")\n", "sss" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[65, 0, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 84, 0, 0],\n", " [65, 84, 0, 0],\n", " [65, 84, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 84, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 0, 0, 0],\n", " [65, 0, 0, 0]], dtype=uint8)" ] }, "execution_count": 196, "metadata": {}, "output_type": "execute_result" } ], "source": [ "varmat = reftrick(sss.view(np.uint8), PSEUDO_REF)\n", "varmat" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['>1A_0_7',\n", " '>1B_0_7',\n", " '>1D_0_7',\n", " '>1C_0_7',\n", " '>2E_0_7',\n", " '>2F_0_7',\n", " '>2H_0_7',\n", " '>2G_0_7',\n", " '>3I_0_7',\n", " '>3J_0_7',\n", " '>3L_0_7',\n", " '>3K_0_7']" ] }, "execution_count": 226, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names" ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "31.3 µs ± 764 ns per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n" ] } ], "source": [ "%%timeit\n", "store_alleles(seqs)" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTAAATCCTCACGTCAACATGATGCCTWCATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTGCAATCCTCACGTCAATATGATGCCTACATGAA',\n", " b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTYCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',\n", " b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA']" ] }, "execution_count": 219, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[b\"\".join(i) for i in arrseqs ]" ] }, { "cell_type": "code", "execution_count": 229, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['1A_0',\n", " '1B_0',\n", " '1C_0',\n", " '1D_0',\n", " '2E_0',\n", " '2F_0',\n", " '2G_0',\n", " '2H_0',\n", " '3I_0',\n", " '3J_0',\n", " '3K_0',\n", " '3L_0']" ] }, "execution_count": 229, "metadata": {}, "output_type": "execute_result" } ], "source": [ "snames" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">1A_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">1B_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">1C_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">1D_0_7 b'TGCAGCTTCTGAAGCCTTAAATCCTCACGTCAACATGATGCCTWCATGAA'\n", ">2E_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">2F_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">2G_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">2H_0_7 b'TGCAGCTTCTGAAGCCTGCAATCCTCACGTCAATATGATGCCTACATGAA'\n", ">3I_0_7 b'TGCAGCTYCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">3J_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">3K_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n", ">3L_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'\n" ] } ], "source": [ "widx = np.argsort([i.rsplit(\";\", 1)[0] for i in keys])\n", "for i in widx:\n", " print(names[i], b\"\".join(seqarr[i]))" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2, 3, 4, 7])" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "consensus = varmat[:, 0]\n", "variants = np.nonzero(varmat[:, 1])[0]\n", "variants" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, 
True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " False, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, False, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, False,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, False, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, False, True,\n", " True, False, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, False, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, False, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True],\n", " [ True, True, True, True, True, True, True, True, True,\n", " True, False, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True, True, True, True, True,\n", " True, True, True, True, True]])" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": 
[ "seqarr == seqarr[0]" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "for kidx in range(seqarr.shape[0]):\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'A', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'W', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'G', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'T', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'Y', 'C', 'T', 'T', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],\n", " ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 
'C', 'T', 'T', 'A', 'A',\n", " 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',\n", " 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',\n", " 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A']],\n", " dtype='<U1')" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = \"\".join(align1)[1:].split(\"\\n>\")\n", "dalign1 = dict([i.split(\"\\n\", 1) for i in lines])\n", "keys = sorted(\n", "    dalign1.keys(), \n", "    key=lambda x: int(x.rsplit(\"*\")[-1])\n", ")\n", "seqarr = np.zeros(\n", "    (len(nnames), len(dalign1[keys[0]])),\n", "    dtype='U1',\n", "    )\n", "for kidx, key in enumerate(keys):\n", "    concatseq = dalign1[key].replace(\"\\n\", '')\n", "    seqarr[kidx] = list(dalign1[key].replace(\"\\n\", \"\"))\n", "\n", "seqarr" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fill in edges for each seq\n", "edg = (\n", "    len(concatseq) - len(concatseq.lstrip('-')),\n", "    len(concatseq.rstrip('-')),\n", "    len(concatseq.rstrip('-')),\n", "    len(concatseq.rstrip('-')),\n", "    ) \n", "edges[ldx, kidx] = edg\n", "\n", "edges[ldx, kidx]\n", "\n", "concatseq[0:50] + 'nnnn'[50:50] + concatseq[50:50]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "    seeds = [\n", "        os.path.join(\n", "            data.dirs.across, \n", "            \"{}-{}.htemp\".format(data.name, jobid)) for jobid in jobids\n", "    ]\n", "    allseeds = os.path.join(data.dirs.across, \"{}-x.fa\".format(data.name))\n", "\n", "    cmd1 = ['cat'] + seeds\n", "    cmd2 = [ipyrad.bins.vsearch, '--sortbylength', '-', '--output', allseeds]\n", "    proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)\n", "    proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=sps.PIPE, close_fds=True)\n", "    out = proc2.communicate()\n", "    proc1.stdout.close()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(b'', None)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "out" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "data = s.data\n", "jobid = 2\n", "nthreads = 4" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "    # get files for this jobid\n", "    catshuf = os.path.join(\n", "        data.dirs.across, \n", "        \"{}-{}-catshuf.fa\".format(data.name, jobid))\n", "    uhaplos = os.path.join(\n", "        data.dirs.across, \n", "        \"{}-{}.utemp\".format(data.name, jobid))\n", "    hhaplos = os.path.join(\n", "        data.dirs.across, \n", "        \"{}-{}.htemp\".format(data.name, jobid))\n", "    #logfile = os.path.join(data.dirs.across, \"s6_cluster_stats.txt\")\n", "\n", "    ## parameters that vary by datatype\n", "    ## (too low of cov values yield too many poor alignments)\n", "    strand = \"plus\"\n", "    cov = 0.75  # 0.90\n", "    if data.paramsdict[\"datatype\"] in [\"gbs\", \"2brad\"]:\n", "        strand = \"both\"\n", "        cov = 0.60\n", "    elif data.paramsdict[\"datatype\"] == \"pairgbs\":\n", "        strand = \"both\"\n", "        cov = 0.75  # 0.90\n", "\n", "    ## nthreads is calculated in 'call_cluster()'\n", "    cmd = [ipyrad.bins.vsearch,\n", "           \"-cluster_smallmem\", catshuf,\n", "           \"-strand\", strand,\n", "           \"-query_cov\", str(cov),\n", "           \"-minsl\", str(0.5),\n", "           \"-id\", str(data.paramsdict[\"clust_threshold\"]),\n", "           \"-userout\", uhaplos,\n", "           \"-notmatched\", hhaplos,\n", "           \"-userfields\", \"query+target+qstrand\",\n", "           \"-maxaccepts\", \"1\",\n", "           
\"-maxrejects\", \"0\",\n", " \"-fasta_width\", \"0\",\n", " \"-threads\", str(nthreads), # \"0\",\n", " \"-fulldp\",\n", " \"-usersort\",\n", " ]\n", " proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)\n", " out = proc.communicate()\n", " if proc.returncode:\n", " raise IPyradError(out)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)\n", "out = proc.communicate()\n", "if proc.returncode:\n", " raise IPyradError(out)\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "build_concat_files(s.data, 1, [s.data.samples[i] for i in s.cgroups[1]], 123)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "data = s.data\n", "jobid = 1\n", "samples = [s.data.samples[i] for i in s.cgroups[1]]\n", "randomseed = 123" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "conshandles = [\n", " sample.files.consens[0] for sample in samples if \n", " sample.stats.reads_consens]\n", "conshandles.sort()\n", "assert conshandles, \"no consensus files found\"\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(None, None)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ " ## concatenate all of the gzipped consens files\n", " cmd = ['cat'] + conshandles\n", " groupcons = os.path.join(\n", " data.dirs.across, \n", " \"{}-{}-catcons.gz\".format(data.name, jobid))\n", " LOGGER.debug(\" \".join(cmd))\n", " with open(groupcons, 'w') as output:\n", " call = sps.Popen(cmd, stdout=output, close_fds=True)\n", " call.communicate()\n", "\n", " ## a string of sed substitutions for temporarily replacing hetero sites\n", " ## skips lines with '>', so it doesn't affect taxon names\n", " subs = [\"/>/!s/W/A/g\", \"/>/!s/w/A/g\", \"/>/!s/R/A/g\", \"/>/!s/r/A/g\",\n", " \"/>/!s/M/A/g\", \"/>/!s/m/A/g\", \"/>/!s/K/T/g\", \"/>/!s/k/T/g\",\n", " \"/>/!s/S/C/g\", \"/>/!s/s/C/g\", \"/>/!s/Y/C/g\", \"/>/!s/y/C/g\"]\n", " subs = \";\".join(subs)\n", "\n", " ## impute pseudo-haplo information to avoid mismatch at hetero sites\n", " ## the read data with hetero sites is put back into clustered data later.\n", " ## pipe passed data from gunzip to sed.\n", " cmd1 = [\"gunzip\", \"-c\", groupcons]\n", " cmd2 = [\"sed\", subs]\n", " LOGGER.debug(\" \".join(cmd1))\n", " LOGGER.debug(\" \".join(cmd2))\n", "\n", " proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)\n", " allhaps = groupcons.replace(\"-catcons.gz\", \"-cathaps.fa\")\n", " with open(allhaps, 'w') as output:\n", " proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)\n", " proc2.communicate()\n", " proc1.stdout.close()\n", "\n", " ## now sort the file using vsearch\n", " allsort = groupcons.replace(\"-catcons.gz\", \"-catsort.fa\")\n", " cmd1 = [ipyrad.bins.vsearch,\n", " \"--sortbylength\", allhaps,\n", " \"--fasta_width\", \"0\",\n", " \"--output\", allsort]\n", " LOGGER.debug(\" \".join(cmd1))\n", " proc1 = sps.Popen(cmd1, close_fds=True)\n", " proc1.communicate()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dirs.across" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# calculate the theading of 
cluster1 jobs:\n", "ncpus = len(self.ipyclient.ids)\n", "njobs = len(self.cgroups)\n", "nnodes = len(self.hostd)\n", "minthreads = 2\n", "maxthreads = 10" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# product\n", "targets = [0, 1]\n", "self.thview = self.ipyclient.load_balanced_view(targets=targets)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:02 | concatenating inputs | s6 |\n" ] } ], "source": [ "s.prepare_concats()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "ename": "IPyradError", "evalue": "(b'vsearch v2.8.0_linux_x86_64, 15.5GB RAM, 4 cores\\nhttps://github.com/torognes/vsearch\\n\\n\\n\\nUnable to open file for reading (/home/deren/Documents/ipyrad/tests/4-refpairtest_across/4-refpairtest-2-catshuf.fa)\\n', None)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcluster1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36mcluster1\u001b[0;34m(data, jobid, nthreads)\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIPyradError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m: (b'vsearch v2.8.0_linux_x86_64, 15.5GB RAM, 4 cores\\nhttps://github.com/torognes/vsearch\\n\\n\\n\\nUnable to open file for reading (/home/deren/Documents/ipyrad/tests/4-refpairtest_across/4-refpairtest-2-catshuf.fa)\\n', None)" ] } ], "source": [ "cluster1(data, 2, 4)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "def cluster1(data, jobid, nthreads):\n", "\n", " # get files for this jobid\n", " catshuf = os.path.join(\n", " data.dirs.across, \n", " \"{}-{}-catshuf.fa\".format(data.name, jobid))\n", " uhaplos = os.path.join(\n", " data.dirs.across, \n", " \"{}-{}.utemp\".format(data.name, jobid))\n", " hhaplos = os.path.join(\n", " data.dirs.across, \n", " \"{}-{}.htemp\".format(data.name, jobid))\n", "\n", " ## parameters that vary by datatype\n", " ## (too low of cov values yield too many poor alignments)\n", " strand = \"plus\"\n", " cov = 0.75 # 0.90\n", " if data.paramsdict[\"datatype\"] in [\"gbs\", \"2brad\"]:\n", " strand = \"both\"\n", " cov = 0.60\n", " elif data.paramsdict[\"datatype\"] == \"pairgbs\":\n", " strand = \"both\"\n", " cov = 0.75 # 0.90\n", "\n", " ## nthreads is calculated in 'call_cluster()'\n", " cmd = [ipyrad.bins.vsearch,\n", " \"-cluster_smallmem\", catshuf,\n", " \"-strand\", strand,\n", " \"-query_cov\", str(cov),\n", " \"-minsl\", str(0.5),\n", 
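"           # NB (added annotation): -id takes the assembly's clust_threshold\n", "           # as vsearch's pairwise-identity cutoff, and -minsl 0.5 rejects\n", "           # hits whose shorter/longer sequence length ratio is below 50%.\n",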
" \"-id\", str(data.paramsdict[\"clust_threshold\"]),\n", " \"-userout\", uhaplos,\n", " \"-notmatched\", hhaplos,\n", " \"-userfields\", \"query+target+qstrand\",\n", " \"-maxaccepts\", \"1\",\n", " \"-maxrejects\", \"0\",\n", " \"-fasta_width\", \"0\",\n", " \"-threads\", str(nthreads), # \"0\",\n", " \"-fulldp\",\n", " \"-usersort\",\n", " ]\n", " proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)\n", " out = proc.communicate()\n", " if proc.returncode:\n", " raise IPyradError(out)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "if len(self.samples) < 50:\n", " nclusters = 1\n", "elif len(self.samples) < 100:\n", " nclusters = 2\n", "elif len(self.samples) < 200:\n", " nclusters = 4\n", "elif len(self.samples) > 500:\n", " nclusters = 10\n", "else:\n", " nclusters = int(len(self.samples) / 10)\n", " \n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: ['1A_0', '1B_0', '1C_0', '1D_0'],\n", " 1: ['2E_0', '2F_0', '2G_0', '2H_0'],\n", " 2: ['3I_0', '3J_0', '3K_0', '3L_0']}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "self.cgroups" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'oud': [0, 1, 2, 3]}" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hosts\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }