{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import ipyrad as ip\n", "import ipyparallel as ipp " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.clustmap import *\n", "from ipyrad.assemble.consens_se import *" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:459: RuntimeWarning: \n", " Controller appears to be listening on localhost, but not on this machine.\n", " If this is true, you should specify Client(...,sshserver='you@oud')\n", " or instruct your controller to listen on an external IP.\n", " RuntimeWarning)\n" ] } ], "source": [ "ipyclient = ipp.Client()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New Assembly: test\n", "0 assembly_name test \n", "1 project_dir ./hotfix \n", "2 raw_fastq_path ./ipsimdata/pairddrad_example_R*_.fastq.gz \n", "3 barcodes_path ./ipsimdata/pairddrad_example_barcodes.txt \n", "4 sorted_fastq_path \n", "5 assembly_method reference \n", "6 reference_sequence ./ipsimdata/pairddrad_example_genome.fa \n", "7 datatype pairddrad \n", "8 restriction_overhang ('TGCAG', 'CCG') \n", "9 max_low_qual_bases 5 \n", "10 phred_Qscore_offset 33 \n", "11 mindepth_statistical 6 \n", "12 mindepth_majrule 6 \n", "13 maxdepth 10000 \n", "14 clust_threshold 0.85 \n", "15 max_barcode_mismatch 0 \n", "16 filter_adapters 0 \n", "17 filter_min_trim_len 35 \n", "18 max_alleles_consens 2 \n", "19 max_Ns_consens (5, 5) \n", "20 max_Hs_consens (8, 8) \n", "21 min_samples_locus 4 \n", "22 max_SNPs_locus (20, 20) \n", "23 max_Indels_locus (8, 8) \n", "24 max_shared_Hs_locus 0.5 \n", "25 trim_reads (0, 0, 0, 0) \n", "26 trim_loci (0, 0, 0, 0) \n", "27 output_formats ['p', 's', 'v'] \n", "28 pop_assign_file \n" ] } ], "source": [ "data = ip.Assembly(\"test\")\n", "data.set_params(\"project_dir\", \"hotfix\")\n", "data.set_params(\"raw_fastq_path\", \"ipsimdata/pairddrad_example_R*_.fastq.gz\")\n", "data.set_params(\"barcodes_path\", \"ipsimdata/pairddrad_example_barcodes.txt\")\n", "data.set_params(\"reference_sequence\", \"ipsimdata/pairddrad_example_genome.fa\")\n", "data.set_params(\"assembly_method\", \"reference\")\n", "data.set_params(\"datatype\", \"pairddrad\")\n", "data.set_params(\"restriction_overhang\", (\"TGCAG\", \"CCG\")) \n", "data.get_params()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:04 | sorting reads | s1 |\n", "[####################] 100% 0:00:01 | writing/compressing | s1 |\n", "[####################] 100% 0:00:04 | processing reads | s2 |\n" ] } ], "source": [ "data.run(\"12\", force=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:00 | indexing reference | s3 |\n", "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:00:01 | join unmerged pairs | s3 |\n", "[####################] 100% 0:00:03 | dereplicating | s3 |\n", "[####################] 100% 0:00:00 | splitting dereps | s3 |\n", "[####################] 100% 0:00:03 | mapping reads | s3 |\n", "[####################] 100% 0:00:10 | building clusters | s3 |\n", "[####################] 100% 0:00:00 | calc cluster stats | s3 |\n" ] } ], "source": [ "data.run(\"3\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:06 | inferring [H, E] | s4 |\n" ] } ], "source": [ "data.run('4')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:00 | calculating depths | s5 |\n", "[####################] 100% 0:00:00 | chunking clusters | s5 |\n", "[####################] 100% 0:00:27 | consens calling | s5 |\n", "[####################] 100% 0:00:01 | indexing alleles | s5 |\n" ] } ], "source": [ "data.run(\"5\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:00 | concatenating bams | s6 |\n", "[####################] 100% 0:00:00 | fetching regions | s6 |\n", "[####################] 100% 0:00:00 | building loci | s6 |\n" ] } ], "source": [ "data.run(\"6\")" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.write_outputs import *\n", "bself = Step7(data, True, ipyclient)\n", "self.split_clusters()\n", "\n", "jobs = glob.glob(os.path.join(self.data.tmpdir, \"chunk-*\"))\n", "jobs = sorted(jobs, key=lambda x: int(x.rsplit(\"-\")[-1])) \n", "for jobfile in jobs:\n", " args = (self.data, self.chunksize, jobfile)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "def get_edges(self, seqs):\n", " \"\"\"\n", " Trim terminal edges or mask internal edges based on three criteria and\n", " take the max for each edge.\n", " 1. user entered hard trimming.\n", " 2. removing cutsite overhangs.\n", " 3. trimming singleton-like overhangs from seqs of diff lengths.\n", " \"\"\"\n", " # record whether to filter this locus based on sample coverage\n", " bad = False\n", "\n", " # 1. hard trim edges\n", " trim1 = np.array(self.data.paramsdict[\"trim_loci\"])\n", "\n", " # 2. fuzzy match for trimming restriction site where it's expected.\n", " trim2 = np.array([0, 0, 0, 0])\n", " overhangs = np.array([\n", " i.encode() for i in self.data.paramsdict[\"restriction_overhang\"]\n", " ])\n", " for pos, overhang in enumerate(overhangs):\n", " if overhang:\n", " cutter = np.array(list(overhang))\n", " trim2[pos] = check_cutter(seqs, pos, cutter, 0.75)\n", "\n", " # 3. find where the edge is not indel marked (really unknown (\"N\"))\n", " trim3 = np.array([0, 0, 0, 0])\n", " try: \n", " minsamp = min(4, seqs.shape[0])\n", " # minsamp = max(minsamp, self.data.paramsdict[\"min_samples_locus\"])\n", " mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)\n", " for pos in range(4):\n", " trim3[pos] = check_minsamp(seqs, pos, minsamp, mincovs)\n", " except ValueError:\n", " print('error')\n", " bad = True\n", "\n", " # get max edges\n", " print(trim1, trim2, trim3)\n", " trim = np.max([trim1, trim2, trim3], axis=0)\n", "\n", " # return edges as slice indices\n", " r1left = trim[0]\n", " \n", " # single-end simple:\n", " if \"pair\" not in self.data.paramsdict[\"datatype\"]:\n", " r1right = seqs.shape[1] - trim[1]\n", " r2left = r2right = r1right\n", " edges = (r1left, r1right, r2left, r2right)\n", " \n", " else:\n", " r1right = \n", " \n", " \n", "\n", " # get filter\n", " if (r1right < r1left) or (r2left < r1right) or (r2right < r2left):\n", " bad = True\n", "\n", " # if loc length is too short then filter\n", " if (r2right - r1left) < self.data.paramsdict[\"filter_min_trim_len\"]:\n", " bad = True\n", "\n", " return bad, edges" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "def check_minsamp(seq, position, minsamp, mincovs):\n", " \"used in Processor.get_edges() for trimming edges of - or N sites.\"\n", " \n", " if position == 0: \n", " # find leftmost edge with minsamp coverage\n", " leftmost = np.where(mincovs >= minsamp)[0]\n", " if leftmost.size:\n", " return leftmost.min()\n", " \n", " # no sites actually have minsamp coverage although there are minsamp\n", " # rows of data, this can happen when reads only partially overlap. Loc\n", " # should be excluded for minsamp filter.\n", " else:\n", " raise ValueError(\"no sites above minsamp coverage in edge trim\")\n", " \n", " elif position == 1:\n", " maxhit = np.where(mincovs >= minsamp)[0].max()\n", " return seq.shape[1] - (maxhit + 1)\n", "\n", " ## TODO... \n", " elif position == 2:\n", " return 0\n", " \n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 456, 456, 456)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 471, 471, 471)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 466, 466, 466)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 478, 478, 478)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 464, 464, 464)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 459, 459, 459)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 484, 484, 484)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 441, 441, 441)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 470, 470, 470)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 486, 486, 486)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 442, 442, 442)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 465, 465, 465)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 478, 478, 478)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 453, 453, 453)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 453, 453, 453)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 466, 466, 466)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 473, 473, 473)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 441, 441, 441)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 438, 438, 438)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 464, 464, 464)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 482, 482, 482)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 443, 443, 443)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 469, 469, 469)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 456, 456, 456)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 480, 480, 480)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 463, 463, 463)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 478, 478, 478)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 438, 438, 438)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 442, 442, 442)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 443, 443, 443)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 454, 454, 454)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 467, 467, 467)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 451, 451, 451)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 472, 472, 472)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 476, 476, 476)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 439, 439, 439)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 457, 457, 457)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 453, 453, 453)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 475, 475, 475)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 448, 448, 448)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 440, 440, 440)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 460, 460, 460)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 469, 469, 469)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 473, 473, 473)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 444, 444, 444)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 442, 442, 442)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 449, 449, 449)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 484, 484, 484)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 486, 486, 486)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 478, 478, 478)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 467, 467, 467)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 479, 479, 479)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 476, 476, 476)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 446, 446, 446)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 452, 452, 452)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 478, 478, 478)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 450, 450, 450)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 458, 458, 458)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 453, 453, 453)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 452, 452, 452)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 451, 451, 451)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 459, 459, 459)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 443, 443, 443)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 484, 484, 484)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 453, 453, 453)\n", "[0 0 0 0] [5 3 0 0] [0 0 0 0]\n", "False (5, 442, 442, 442)\n" ] } ], "source": [ "# store list of edge trims for VCF building\n", "edgelist = []\n", "\n", "# todo: this could be an iterator...\n", "with open(self.chunkfile, 'rb') as infile:\n", " loci = infile.read().split(b\"//\\n//\\n\")\n", "\n", " # iterate over loci\n", " for iloc, loc in enumerate(loci): \n", " # load names and seqs \n", " lines = loc.decode().strip().split(\"\\n\")\n", " names = []\n", " nidxs = []\n", " aseqs = []\n", " useqs = []\n", " for line in lines:\n", " if line[0] == \">\":\n", " name, nidx = line[1:].rsplit(\"_\", 1)\n", " names.append(name)\n", " nidxs.append(nidx)\n", " else:\n", " aseqs.append(list(line))\n", " useqs.append(list(line.upper()))\n", "\n", " # filter to include only samples in this assembly\n", " mask = [i in self.data.snames for i in names]\n", " names = np.array(names)[mask].tolist()\n", " nidxs = np.array(nidxs)[mask].tolist()\n", " useqs = np.array(useqs)[mask, :].astype(bytes).view(np.uint8)\n", " aseqs = np.array(aseqs)[mask, :].astype(bytes).view(np.uint8)\n", "\n", " # apply filters\n", " efilter, edges = get_edges(self, useqs)\n", " print(efilter, edges)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "data = self.data\n", "chunksize = self.chunksize\n", "chunkfile = jobs[0]\n", "\n", "self = Processor(data, chunksize, chunkfile)\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:01 | applying filters | s7 |\n" ] } ], "source": [ "self.remote_process_chunks()\n", "self.collect_stats()\n", "self.store_file_handles()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "start = time.time()\n", "printstr = (\"building arrays \", \"s7\")\n", "rasyncs = {}\n", "args0 = (self.data,)\n", "args1 = (self.data, self.ntaxa, self.nbases, self.nloci)\n", "args2 = (self.data, self.ntaxa, self.nsnps)\n", "\n", "write_loci_and_alleles(*args0)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "fill_seq_array(*args1)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 6\n", "6 16\n", "16 28\n", "28 37\n", "37 46\n", "46 51\n", "51 61\n", "61 74\n", "74 83\n", "83 89\n", "89 102\n", "102 112\n", "112 121\n", "121 128\n", "128 134\n", "134 142\n", "142 149\n", "149 158\n", "158 163\n", "163 172\n", "172 187\n", "187 201\n", "201 208\n", "208 217\n", "217 226\n", "226 239\n", "239 246\n", "246 257\n", "257 265\n", "265 273\n", "273 277\n", "277 290\n", "290 297\n", "297 301\n", "301 311\n", "311 320\n", "320 330\n", "330 340\n", "340 352\n", "352 361\n", "361 369\n", "369 381\n", "381 391\n", "391 404\n", "404 413\n", "413 419\n", "419 432\n", "432 442\n", "442 458\n", "458 465\n", "465 482\n", "482 487\n", "487 492\n", "492 499\n", "499 511\n", "511 517\n", "517 530\n", "530 534\n", "534 543\n", "543 549\n", "549 562\n", "562 574\n", "574 585\n", "585 597\n", "597 612\n", "612 620\n", "620 627\n", "627 635\n", "635 645\n", "645 661\n", "661 667\n", "667 683\n", "683 691\n", "691 697\n", "697 710\n", "710 721\n", "721 737\n", "737 745\n", "745 751\n", "751 771\n", "771 779\n", "779 785\n", "785 792\n", "792 809\n", "809 819\n", "819 831\n", "831 838\n", "838 848\n", "848 863\n", "863 871\n", "871 881\n", "881 892\n", "892 895\n", "895 903\n", "903 921\n", "921 927\n", "927 934\n", "934 946\n", "946 954\n", "954 964\n", "964 975\n", "975 979\n", "979 989\n", "989 996\n", "996 1004\n", "1004 1014\n", "1014 1022\n", "1022 1031\n", "1031 1041\n", "1041 1050\n", "1050 1062\n", "1062 1066\n", "1066 1079\n", "1079 1090\n", "1090 1099\n", "1099 1112\n", "1112 1129\n", "1129 1140\n", "1140 1153\n", "1153 1162\n", "1162 1168\n", "1168 1180\n", "1180 1186\n", "1186 1193\n", "1193 1207\n", "1207 1212\n", "1212 1219\n", "1219 1230\n", "1230 1242\n", "1242 1254\n", "1254 1268\n", "1268 1277\n", "1277 1289\n", "1289 1298\n", "1298 1305\n", "1305 1317\n", "1317 1328\n", "1328 1338\n", "1338 1346\n", "1346 1353\n", "1353 1363\n", "1363 1372\n", "1372 1380\n", "1380 1397\n", "1397 1404\n", "1404 1419\n", "1419 1430\n", "1430 1437\n", "1437 1447\n", "1447 1459\n", "1459 1471\n", "1471 1478\n", "1478 1487\n", "1487 1492\n", "1492 1501\n", "1501 1512\n", "1512 1517\n", "1517 1525\n", "1525 1541\n", "1541 1555\n", "1555 1561\n", "1561 1577\n", "1577 1582\n", "1582 1595\n", "1595 1606\n", "1606 1612\n", "1612 1622\n", "1622 1635\n", "1635 1641\n", "1641 1651\n", "1651 1659\n", "1659 1662\n", "1662 1674\n", "1674 1684\n", "1684 1696\n", "1696 1703\n", "1703 1714\n", "1714 1721\n", "1721 1731\n", "1731 1737\n", "1737 1743\n", "1743 1750\n", "1750 1763\n", "1763 1775\n", "1775 1787\n", "1787 1794\n", "1794 1805\n", "1805 1812\n", "1812 1821\n", "1821 1827\n", "1827 1833\n", "1833 1847\n", "1847 1856\n", "1856 1867\n", "1867 1876\n", "1876 1886\n", "1886 1895\n", "1895 1904\n", "1904 1909\n", "1909 1922\n", "1922 1932\n", "1932 1943\n", "1943 1953\n", "1953 1963\n", "1963 1971\n", "1971 1984\n", "1984 1994\n", "1994 1997\n", "1997 2005\n", "2005 2014\n", "2014 2023\n", "2023 2042\n", "2042 2051\n", "2051 2063\n", "2063 2074\n", "2074 2082\n", "2082 2091\n", "2091 2100\n", "2100 2106\n", "2106 2119\n", "2119 2126\n", "2126 2138\n", "2138 2144\n", "2144 2155\n", "2155 2162\n", "2162 2171\n", "2171 2174\n", "2174 2183\n", "2183 2193\n", "2193 2205\n", "2205 2215\n", "2215 2225\n", "2225 2232\n", "2232 2242\n", "2242 2252\n", "2252 2256\n", "2256 2265\n", "2265 2274\n", "2274 2286\n", "2286 2298\n", "2298 2304\n", "2304 2321\n", "2321 2334\n", "2334 2342\n", "2342 2352\n", "2352 2357\n", "2357 2365\n", "2365 2375\n", "2375 2385\n", "2385 2389\n", "2389 2396\n", "2396 2400\n", "2400 2411\n", "2411 2419\n", "2419 2426\n", "2426 2440\n", "2440 2448\n", "2448 2460\n", "2460 2467\n", "2467 2479\n" ] }, { "ename": "ValueError", "evalue": "could not broadcast input array from shape (12) into shape (7)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtmploc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 87\u001b[0;31m \u001b[0mtmparr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0msidxs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msnpsites\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;31m# store snpsmap data 1-indexed with chroms info\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: could not broadcast input array from shape (12) into shape (7)" ] } ], "source": [ "data = self.data\n", "ntaxa = self.ntaxa\n", "nsnps = self.nsnps\n", "\n", "# get faidict to convert chroms to ints\n", "if data.isref:\n", " faidict = chroms2ints(data, True)\n", "\n", "# open new database file handle\n", "with h5py.File(data.snps_database, 'w') as io5:\n", "\n", " # Database files for storing arrays on disk. \n", " # Should optimize for slicing by rows if we run into slow writing, or \n", " # it uses too much mem. For now letting h5py to auto-chunking.\n", " io5.create_dataset(\n", " name=\"snps\",\n", " shape=(ntaxa, nsnps),\n", " dtype=np.uint8,\n", " )\n", " # store snp locations:\n", " # (loc-counter, loc-snp-counter, loc-snp-pos, chrom, chrom-snp-pos)\n", " io5.create_dataset(\n", " name=\"snpsmap\",\n", " shape=(nsnps, 5),\n", " dtype=np.uint32,\n", " )\n", " # store snp locations\n", " io5.create_dataset(\n", " name=\"pseudoref\",\n", " shape=(nsnps, 4),\n", " dtype=np.uint8,\n", " )\n", " # store genotype calls (0/0, 0/1, 0/2, etc.)\n", " io5.create_dataset(\n", " name=\"genos\",\n", " shape=(nsnps, ntaxa, 2),\n", " dtype=np.uint8,\n", " )\n", "\n", " # gather all loci bits\n", " locibits = glob.glob(os.path.join(data.tmpdir, \"*.loci\"))\n", " sortbits = sorted(locibits, \n", " key=lambda x: int(x.rsplit(\"-\", 1)[-1][:-5]))\n", "\n", " # name order for entry in array\n", " sidxs = {sample: i for (i, sample) in enumerate(data.snames)}\n", "\n", " # iterate through file\n", " start = end = 0\n", " tmploc = {}\n", " locidx = 1\n", " snpidx = 1 \n", " \n", " # array to store until writing\n", " tmparr = np.zeros((ntaxa, nsnps), dtype=np.uint8)\n", " tmpmap = np.zeros((nsnps, 5), dtype=np.uint32)\n", "\n", " # iterate over chunkfiles\n", " for bit in sortbits:\n", " # iterate lines of file until locus endings\n", " for line in iter(open(bit, 'r')):\n", "\n", " # still filling locus until |\\n\n", " if \"|\\n\" not in line:\n", " name, seq = line.split()\n", " tmploc[name] = seq\n", "\n", " # locus is full, dump it\n", " else:\n", " # convert seqs to an array\n", " loc = (\n", " np.array([list(i) for i in tmploc.values()])\n", " .astype(bytes).view(np.uint8)\n", " )\n", " snps, idxs, _ = line[len(data.snppad):].rsplit(\"|\", 2)\n", " snpsmask = np.array(list(snps)) != \" \"\n", " snpsidx = np.where(snpsmask)[0]\n", "\n", " # select only the SNP sites\n", " snpsites = loc[:, snpsmask]\n", "\n", " # store end position of locus for map\n", " end = start + snpsites.shape[1]\n", " print(start, end)\n", " \n", " for idx, name in enumerate(tmploc):\n", " tmparr[sidxs[name], start:end] = snpsites[idx, :]\n", "\n", " # store snpsmap data 1-indexed with chroms info\n", " if data.isref:\n", " chrom, pos = idxs.split(\",\")[0].split(\":\")\n", " start = int(pos.split(\"-\")[0])\n", " #chromidx = faidict[chrom]\n", " chromidx = int(chrom)\n", " for isnp in range(snpsites.shape[1]):\n", " isnpx = snpsidx[isnp]\n", " tmpmap[snpidx - 1] = (\n", " locidx, isnp, isnpx, chromidx, isnpx + start,\n", " )\n", " snpidx += 1\n", " # store snpsmap data (snpidx is 1-indexed)\n", " else:\n", " for isnp in range(snpsites.shape[1]):\n", " tmpmap[snpidx - 1] = (\n", " locidx, isnp, snpsidx[isnp], 0, snpidx,\n", " )\n", " snpidx += 1\n", " locidx += 1\n", "\n", " # reset locus\n", " start = end\n", " tmploc = {}\n", "\n", " # fill missing with 78 (N)\n", " tmparr[tmparr == 0] = 78" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(13, 2474)\n", "2467 2479\n" ] }, { "data": { "text/plain": [ "array([0, 0, 0, 0, 0, 0, 0], dtype=uint8)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(tmparr.shape)\n", "\n", "\n", "print(start, end)\n", "tmparr[sidxs[name], start:end]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "could not broadcast input array from shape (12) into shape (7)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfill_snp_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/write_outputs.py\u001b[0m in \u001b[0;36mfill_snp_array\u001b[0;34m(data, ntaxa, nsnps)\u001b[0m\n\u001b[1;32m 1585\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstart\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0msnpsites\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1586\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtmploc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1587\u001b[0;31m \u001b[0mtmparr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0msidxs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msnpsites\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1588\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1589\u001b[0m \u001b[0;31m# store snpsmap data 1-indexed with chroms info\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: could not broadcast input array from shape (12) into shape (7)" ] } ], "source": [ "fill_snp_array(*args2)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:05 | applying filters | s7 |\n", "[####################] 100% 0:00:01 | building arrays | s7 |\n", "\n", "Encountered an unexpected error:\n", "ValueError(could not broadcast input array from shape (12) into shape (7))\n" ] }, { "ename": "RemoteError", "evalue": "ValueError(could not broadcast input array from shape (12) into shape (7))", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/write_outputs.py\u001b[0m in \u001b[0;36mfill_snp_array\u001b[0;34m(data, ntaxa, nsnps)\u001b[0m", "\u001b[1;32m 1585\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstart\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0msnpsites\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m", "\u001b[1;32m 1586\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtmploc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m", "\u001b[0;32m-> 1587\u001b[0;31m \u001b[0mtmparr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0msidxs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msnpsites\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m", "\u001b[0m\u001b[1;32m 1588\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m", "\u001b[1;32m 1589\u001b[0m \u001b[0;31m# store snpsmap data 1-indexed with chroms info\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m", "\u001b[0;31mValueError\u001b[0m: could not broadcast input array from shape (12) into shape (7)" ] } ], "source": [ "data.run(\"7\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:00 | indexing reference | s3 |\n", "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:00:01 | join unmerged pairs | s3 |\n", "[####################] 100% 0:00:00 | dereplicating | s3 |\n", "[####################] 100% 0:00:00 | splitting dereps | s3 |\n", "[####################] 100% 0:00:02 | mapping reads | s3 |\n", "[####################] 100% 0:00:09 | building clusters | s3 |\n", "[####################] 100% 0:00:00 | calc cluster stats | s3 |\n" ] } ], "source": [ "data.run(\"3\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:02 | inferring [H, E] | s4 |\n", "[####################] 100% 0:00:00 | calculating depths | s5 |\n", "[####################] 100% 0:00:00 | chunking clusters | s5 |\n", "[####################] 100% 0:00:24 | consens calling | s5 |\n", "[####################] 100% 0:00:01 | indexing alleles | s5 |\n", "[####################] 100% 0:00:00 | concatenating bams | s6 |\n", "[####################] 100% 0:00:00 | building clusters | s6 |\n", "\n", " Encountered an unexpected error (see ./ipyrad_log.txt)\n", " Error message is below -------------------------------\n", "name 'data' is not defined\n" ] } ], "source": [ "data.run(\"456\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[force] overwriting fastq files previously created by ipyrad.\n", "This _does not_ affect your original/raw data files.\n", "[####################] 100% 0:00:03 | sorting reads | s1 |\n", "[####################] 100% 0:00:00 | writing/compressing | s1 |\n" ] } ], "source": [ "data.run(\"1\", force=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: 5-tortas\n", "from saved path: ~/Documents/ipyrad/tests/tortas/5-tortas.json\n" ] } ], "source": [ "data = ip.load_json(\"tortas/5-tortas.json\")\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "d2 = data.branch(\"d2\", subsamples=[\"AGO09concat\"])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: d2\n", "[####################] 100% 0:00:00 | indexing reference | s3 |\n", "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:02:44 | join unmerged pairs | s3 |\n", "[####################] 100% 0:01:14 | dereplicating | s3 |\n", "[####################] 100% 0:00:18 | splitting dereps | s3 |\n", "[####################] 100% 0:52:29 | mapping reads | s3 |\n", "[####################] 100% 0:55:43 | building clusters | s3 |\n", "[####################] 100% 0:00:13 | calc cluster stats | s3 |\n" ] } ], "source": [ "d2.run(\"3\", force=True)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: d2\n", "[####################] 100% 0:00:21 | inferring [H, E] | s4 |\n", "[####################] 100% 0:00:13 | calculating depths | s5 |\n", "[####################] 100% 0:00:22 | chunking clusters | s5 |\n", "[####################] 100% 15:27:41 | consens calling | s5 |\n", "[####################] 100% 0:00:41 | indexing alleles | s5 |\n" ] } ], "source": [ "d2.run(\"45\", force=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: d2\n", "from saved path: ~/Documents/ipyrad/tests/tortas/d2.json\n" ] } ], "source": [ "d2 = ip.load_json(\"./tortas/d2.json\")\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statereads_rawreads_passed_filterrefseq_mapped_readsrefseq_unmapped_readsclusters_totalclusters_hidepthhetero_esterror_estreads_consens
AGO09concat515650127151210478307851681319613642793810810.0016360.001511380396
\n", "
" ], "text/plain": [ " state reads_raw reads_passed_filter refseq_mapped_reads \\\n", "AGO09concat 5 15650127 15121047 8307851 \n", "\n", " refseq_unmapped_reads clusters_total clusters_hidepth \\\n", "AGO09concat 6813196 1364279 381081 \n", "\n", " hetero_est error_est reads_consens \n", "AGO09concat 0.001636 0.001511 380396 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d2.stats" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "self = Step5(data, True, ipyclient)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:17 | calculating depths | s5 |\n" ] } ], "source": [ "self.remote_calculate_depths()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:28 | chunking clusters | s5 |\n" ] } ], "source": [ "self.remote_make_chunks()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 1:07:30 | consens calling | s5 |\n" ] } ], "source": [ "statsdicts = self.remote_process_chunks()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'AGO08concat': [({'name': 90642,\n", " 'heteros': 15544,\n", " 'nsites': 22873090,\n", " 'nconsens': 90642},\n", " {'depth': 227971, 'maxh': 30, 'maxn': 2}),\n", " ({'name': 406474, 'heteros': 17080, 'nsites': 22034591, 'nconsens': 87829},\n", " {'depth': 230745, 'maxh': 69, 'maxn': 2}),\n", " ({'name': 725373, 'heteros': 17170, 'nsites': 22095933, 'nconsens': 88083},\n", " {'depth': 230497, 'maxh': 62, 'maxn': 3}),\n", " ({'name': 1044454, 'heteros': 28259, 'nsites': 22420201, 'nconsens': 88519},\n", " {'depth': 229823, 'maxh': 301, 'maxn': 2})],\n", " 'AGO11concat': [({'name': 88491,\n", " 'heteros': 15532,\n", " 'nsites': 17265948,\n", " 'nconsens': 88491},\n", " {'depth': 144187, 'maxh': 31, 'maxn': 0}),\n", " ({'name': 319207, 'heteros': 16819, 'nsites': 16849885, 'nconsens': 86498},\n", " {'depth': 146149, 'maxh': 62, 'maxn': 0}),\n", " ({'name': 552196, 'heteros': 16745, 'nsites': 16895490, 'nconsens': 86778},\n", " {'depth': 145870, 'maxh': 61, 'maxn': 0}),\n", " ({'name': 785074, 'heteros': 26253, 'nsites': 17128785, 'nconsens': 86947},\n", " {'depth': 145382, 'maxh': 374, 'maxn': 0})],\n", " 'AGO02concat': [({'name': 79397,\n", " 'heteros': 14200,\n", " 'nsites': 16889465,\n", " 'nconsens': 79397},\n", " {'depth': 189044, 'maxh': 24, 'maxn': 0}),\n", " ({'name': 345488, 'heteros': 15332, 'nsites': 16378730, 'nconsens': 77023},\n", " {'depth': 191394, 'maxh': 48, 'maxn': 0}),\n", " ({'name': 614298, 'heteros': 15677, 'nsites': 16486755, 'nconsens': 77368},\n", " {'depth': 191048, 'maxh': 49, 'maxn': 0}),\n", " ({'name': 883384, 'heteros': 24439, 'nsites': 16840082, 'nconsens': 77989},\n", " {'depth': 190103, 'maxh': 370, 'maxn': 0})],\n", " 'AGO09concat': [({'name': 1, 'heteros': 0, 'nsites': 147, 'nconsens': 1},\n", " {'depth': 9, 'maxh': 0, 'maxn': 0})]}" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "statsdicts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "self.remote_concatenate_chunks()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "self.data_store(statsdicts)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statereads_rawreads_passed_filterrefseq_mapped_readsrefseq_unmapped_readsclusters_totalclusters_hidepthhetero_esterror_estreads_consens
AGO02concat511050294108006727210402359027010738573122710.0024550.001599310509
AGO08concat513408401130303297245593578473612745803555470.0028250.001497353139
AGO09concat5156501271512104783078516813196136427910.0027110.001882378504
AGO11concat51284893612370018785511645149029308303492450.0021080.001574347812
\n", "
" ], "text/plain": [ " state reads_raw reads_passed_filter refseq_mapped_reads \\\n", "AGO02concat 5 11050294 10800672 7210402 \n", "AGO08concat 5 13408401 13030329 7245593 \n", "AGO09concat 5 15650127 15121047 8307851 \n", "AGO11concat 5 12848936 12370018 7855116 \n", "\n", " refseq_unmapped_reads clusters_total clusters_hidepth \\\n", "AGO02concat 3590270 1073857 312271 \n", "AGO08concat 5784736 1274580 355547 \n", "AGO09concat 6813196 1364279 1 \n", "AGO11concat 4514902 930830 349245 \n", "\n", " hetero_est error_est reads_consens \n", "AGO02concat 0.002455 0.001599 310509 \n", "AGO08concat 0.002825 0.001497 353139 \n", "AGO09concat 0.002711 0.001882 378504 \n", "AGO11concat 0.002108 0.001574 347812 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.stats" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: dd\n", "[####################] 100% 0:00:17 | calculating depths | s5 |\n", "[####################] 100% 0:00:31 | chunking clusters | s5 |\n", "[####################] 100% 1 day, 1:41:43 | consens calling | s5 |\n", "[####################] 100% 0:00:18 | indexing alleles | s5 |\n", "\n", " Encountered an error (see details in ./ipyrad_log.txt)\n", " Error summary is below -------------------------------\n", "KeyboardInterrupt()\n" ] } ], "source": [ "dd = data.branch(\"dd\")\n", "dd.run(\"5\", force=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statereads_rawreads_passed_filterrefseq_mapped_readsrefseq_unmapped_readsclusters_totalclusters_hidepthhetero_esterror_estreads_consens
AGO02concat411050294108006727210402359027010738573122710.0015420.001308310509
AGO08concat413408401130303297245593578473612745803555470.0019100.001097353139
AGO09concat4156501271512104783078516813196136427910.0000420.002260378504
AGO11concat41284893612370018785511645149029308303492450.0013300.001307347812
\n", "
" ], "text/plain": [ " state reads_raw reads_passed_filter refseq_mapped_reads \\\n", "AGO02concat 4 11050294 10800672 7210402 \n", "AGO08concat 4 13408401 13030329 7245593 \n", "AGO09concat 4 15650127 15121047 8307851 \n", "AGO11concat 4 12848936 12370018 7855116 \n", "\n", " refseq_unmapped_reads clusters_total clusters_hidepth \\\n", "AGO02concat 3590270 1073857 312271 \n", "AGO08concat 5784736 1274580 355547 \n", "AGO09concat 6813196 1364279 1 \n", "AGO11concat 4514902 930830 349245 \n", "\n", " hetero_est error_est reads_consens \n", "AGO02concat 0.001542 0.001308 310509 \n", "AGO08concat 0.001910 0.001097 353139 \n", "AGO09concat 0.000042 0.002260 378504 \n", "AGO11concat 0.001330 0.001307 347812 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dd.run(\"3\", force=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: 5-tortas\n", "[####################] 100% 0:00:16 | calculating depths | s5 |\n", "[####################] 100% 0:00:29 | chunking clusters | s5 |\n", "[####################] 100% 1:04:59 | consens calling | s5 |\n", "[####################] 100% 0:00:59 | indexing alleles | s5 |\n", "\n", " Encountered an error (see details in ./ipyrad_log.txt)\n", " Error summary is below -------------------------------\n", "IPyradError(error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\\n[W::sam_read1] parse error at line 10786\\n[main_samview] truncated file.\\n')\n" ] } ], "source": [ "data.run(\"5\", force=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "step = Step6(data, True, ipyclient)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | concatenating bams | s6 |\n" ] } ], "source": [ "step.remote_concat_bams()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | building clusters | s6 |\n" ] } ], "source": [ "step.remote_build_ref_regions()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "self = step" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('MT', 5008, 5467),\n", " ('MT', 10476, 10950),\n", " ('MT', 15959, 16428),\n", " ('MT', 21437, 21918),\n", " ('MT', 26927, 27394),\n", " ('MT', 32403, 32865),\n", " ('MT', 37874, 38361),\n", " ('MT', 43370, 43814),\n", " ('MT', 48823, 49296),\n", " ('MT', 54305, 54794),\n", " ('MT', 59803, 60248),\n", " ('MT', 65257, 65725),\n", " ('MT', 70734, 71215),\n", " ('MT', 76224, 76680),\n", " ('MT', 81689, 82145),\n", " ('MT', 87154, 87623),\n", " ('MT', 92632, 93108),\n", " ('MT', 98117, 98561),\n", " ('MT', 103570, 104011),\n", " ('MT', 109020, 109487)]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regs = self.regions[:20]\n", "regs" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('MT', 5008, 5467)\n", "4 0 459 459\n", "5 0 459 459\n", "7 0 459 459\n", "8 0 459 459\n", "11 0 459 459\n", "\n", "('MT', 10476, 10950)\n", "1 0 474 474\n", "3 0 474 474\n", "5 0 474 474\n", "7 0 474 474\n", "8 0 474 474\n", "11 0 474 474\n", "\n", "('MT', 15959, 16428)\n", "1 0 469 469\n", "4 0 469 469\n", "5 0 469 469\n", "7 0 469 469\n", "8 0 469 469\n", "9 0 469 469\n", "11 0 469 469\n", "\n", "('MT', 21437, 21918)\n", "1 0 481 481\n", "2 0 481 481\n", "3 0 481 481\n", "5 0 481 481\n", "6 0 481 481\n", "7 0 481 481\n", "8 0 481 481\n", "11 0 481 481\n", "12 0 481 481\n", "\n", "('MT', 26927, 27394)\n", "1 0 467 467\n", "2 0 467 467\n", "3 0 467 467\n", "4 0 467 467\n", "6 0 467 467\n", "7 0 467 467\n", "8 0 467 467\n", "10 0 467 467\n", "11 0 467 467\n", "12 0 467 467\n", "\n", "('MT', 32403, 32865)\n", "2 0 462 462\n", "3 0 462 462\n", "4 0 462 462\n", "5 0 462 462\n", "6 0 462 462\n", "7 0 462 462\n", "8 0 462 462\n", "9 0 462 462\n", "10 0 462 462\n", "11 0 462 462\n", "12 0 462 462\n", "\n", "('MT', 37874, 38361)\n", "3 0 487 487\n", "4 0 487 487\n", "5 0 487 487\n", "7 0 487 487\n", "9 0 487 487\n", "10 0 487 487\n", "\n", "('MT', 43370, 43814)\n", "1 0 444 444\n", "2 0 444 444\n", "3 0 444 444\n", "4 0 444 444\n", "6 0 444 444\n", "7 0 444 444\n", "8 0 444 444\n", "9 0 444 444\n", "11 0 444 444\n", "\n", "('MT', 48823, 49296)\n", "1 0 473 473\n", "2 0 473 473\n", "3 0 473 473\n", "4 0 473 473\n", "5 0 473 473\n", "6 0 473 473\n", "7 0 473 473\n", "9 0 473 473\n", "10 0 473 473\n", "11 0 473 473\n", "\n", "('MT', 54305, 54794)\n", "1 0 489 489\n", "2 0 489 489\n", "3 0 489 489\n", "4 0 489 489\n", "6 0 489 489\n", "7 0 489 489\n", "8 0 489 489\n", "9 0 489 489\n", "10 0 489 489\n", "12 0 489 489\n", "\n", "('MT', 59803, 60248)\n", "2 0 445 445\n", "3 0 445 445\n", "4 0 445 445\n", "6 0 445 445\n", "7 0 445 445\n", "8 0 445 445\n", "10 0 445 445\n", "11 0 445 445\n", "\n", "('MT', 65257, 65725)\n", "1 0 468 468\n", "2 0 468 468\n", "3 0 468 468\n", "4 0 468 468\n", "5 0 468 468\n", "6 0 468 468\n", "8 0 468 468\n", "11 0 468 468\n", "\n", "('MT', 70734, 71215)\n", "2 0 481 481\n", "4 0 481 481\n", "5 0 481 481\n", "7 0 481 481\n", "9 0 481 481\n", "10 0 481 481\n", "11 0 481 481\n", "12 0 481 481\n", "\n", "('MT', 76224, 76680)\n", "1 0 456 456\n", "2 0 456 456\n", "3 0 456 456\n", "4 0 456 456\n", "5 0 456 456\n", "6 0 456 456\n", "7 0 456 456\n", "8 0 456 456\n", "10 0 456 456\n", "11 0 456 456\n", "12 0 456 456\n", "\n", "('MT', 81689, 82145)\n", "1 0 456 456\n", "2 0 456 456\n", "3 0 456 456\n", "4 0 456 456\n", "6 0 456 456\n", "7 0 456 456\n", "8 0 456 456\n", "10 0 456 456\n", "12 0 456 456\n", "\n", "('MT', 87154, 87623)\n", "1 0 469 469\n", "2 0 469 469\n", "3 0 469 469\n", "4 0 469 469\n", "5 0 469 469\n", "6 0 469 469\n", "7 0 469 469\n", "9 0 469 469\n", "10 0 469 469\n", "11 0 469 469\n", "12 0 469 469\n", "\n", "('MT', 92632, 93108)\n", "1 0 476 476\n", "3 0 476 476\n", "4 0 476 476\n", "5 0 476 476\n", "7 0 476 476\n", "9 0 476 476\n", "10 0 476 476\n", "11 0 476 476\n", "12 0 476 476\n", "\n", "('MT', 98117, 98561)\n", "2 0 444 444\n", "3 0 444 444\n", "5 0 444 444\n", "6 0 444 444\n", "8 0 444 444\n", "9 0 444 444\n", "11 0 444 444\n", "\n", "('MT', 103570, 104011)\n", "1 0 441 441\n", "2 0 441 441\n", "4 0 441 441\n", "5 0 441 441\n", "6 0 441 441\n", "7 0 441 441\n", "8 0 441 441\n", "9 0 441 441\n", "10 0 441 441\n", "11 0 441 441\n", "\n", "('MT', 109020, 109487)\n", "1 0 467 467\n", "2 0 467 467\n", "3 0 467 467\n", "4 0 467 467\n", "6 0 467 467\n", "7 0 467 467\n", "8 0 467 467\n", "9 0 467 467\n", "10 0 467 467\n", "11 0 467 467\n", "12 0 467 467\n", "\n" ] } ], "source": [ "# access reads from bam file using pysam\n", "bamfile = AlignmentFile(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"cat.sorted.bam\"),\n", " 'rb')\n", "\n", "# catcons output file for raw clusters and db samples\n", "outfile = gzip.open(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.catcons.gz\".format(self.data.name)),\n", " 'wb')\n", "\n", "# write header line to catcons with sample names\n", "snames = sorted([i.name for i in self.samples])\n", "nsamples = len(snames)\n", "outfile.write(\n", " b\" \".join([i.encode() for i in snames]) + b\"\\n\")\n", "\n", "# get clusters\n", "lidx = 0\n", "clusts = []\n", "# while 1:\n", "\n", "# try:\n", "# region = next(self.regions)\n", "# reads = bamfile.fetch(*region)\n", "# except StopIteration:\n", "# break\n", "\n", "for region in regs:\n", " reads = bamfile.fetch(*region)\n", " \n", " # get reference\n", " print(region)\n", " refn, refs = get_ref_region(\n", " data.paramsdict[\"reference_sequence\"], \n", " region[0], region[1]+1, region[2]+1) \n", " \n", " # build cluster dict for sorting \n", " rdict = {}\n", " for read in reads:\n", " rdict[read.qname] = read.seq \n", " keys = sorted(rdict.keys(), key=lambda x: x.rsplit(\":\", 2)[0])\n", " \n", " # build cluster based on map positions (reads are already oriented)\n", " arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)\n", " \n", " # fill it\n", " arr[0] = list(refs)\n", " for idx, key in enumerate(keys):\n", " # get start and stop from this seq\n", " sname = key.rsplit(\"_\", 1)[0]\n", " rstart, rstop = key.rsplit(\":\", 2)[-1].split(\"-\")\n", " sidx = snames.index(sname)\n", " \n", " # get start relative to ref\n", " start = int(rstart) - int(region[1]) - 1\n", " stop = start + int(rstop) - int(rstart)\n", " print(sidx + 1, start, stop, arr.shape[1])\n", " arr[sidx + 1, int(start): int(stop)] = list(rdict[key])\n", " \n", " print(\"\")\n", " arr[arr == b\"\"] = b\"N\"\n", " for line in arr:\n", " outfile.write(line.tostring() + b\"\\n\")\n", " outfile.write(b\"\\n\")\n", " \n", " \n", "outfile.close()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "300 559 259 559 41920 42479 559\n", "('Contig0', 41619, 42478) 859 859\n" ] } ], "source": [ "print(start, stop, stop-start, len(rdict[key]), rstart, rstop, int(rstop) - int(rstart))\n", "print(region, region[2] - region[1], len(refs))" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "empty separator", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mkey\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mValueError\u001b[0m: empty separator" ] } ], "source": [ "key.split(\"\")" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['AGO02concat', 'AGO08concat', 'AGO09concat', 'AGO11concat']" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "snames" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTTGTATTAGAGATGGGGCTGAAGTCAAACCCTGACCTGAACAGTGAGAAAATCTGGTGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCCTCTGCCTCCCTCAAGAAGAAATCAAGAGKGGAAAGGAGCAGGGCGGGAGGTATGGGAAAGGAAGAATGGAATT'" ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "revcomp(\"AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\")" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "cannot copy sequence with size 231 to array axis with dimension 535", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrefs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 45\u001b[0;31m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 46\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;31m# get consens seq and variant site index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: cannot copy sequence with size 231 to array axis with dimension 535" ] } ], "source": [ " # get consens seq and variant site index \n", " clust = []\n", " avars = refvars(arr.view(np.uint8), PSEUDO_REF)\n", " dat = b\"\".join(avars.view(\"S1\")[:, 0]).decode()\n", " snpstring = \"\".join([\"*\" if i else \" \" for i in avars[:, 1]])\n", " clust.append(\"ref_{}:{}-{}\\n{}\".format(*region, dat))\n", "\n", " # or, just build variant string (or don't...)\n", " # write all loci with [locids, nsnps, npis, nindels, ?]\n", " for key in keys:\n", " clust.append(\"{}\\n{}\".format(key, rdict[key]))\n", " clust.append(\"SNPs\\n\" + snpstring)\n", " clusts.append(\"\\n\".join(clust))\n", "\n", " # advance locus counter\n", " lidx += 1\n", "\n", " # write chunks to file\n", " if not lidx % 1000:\n", " outfile.write(\n", " str.encode(\"\\n//\\n//\\n\".join(clusts) + \"\\n//\\n//\\n\"))\n", " clusts = []\n", "\n", "# write remaining\n", "if clusts: \n", " outfile.write(\n", " str.encode(\"\\n//\\n//\\n\".join(clusts) + \"\\n//\\n//\\n\"))\n", "outfile.close()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "cannot copy sequence with size 543 to array axis with dimension 559", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote_build_ref_clusters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/cluster_across.py\u001b[0m in \u001b[0;36mremote_build_ref_clusters\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 589\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 590\u001b[0;31m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 591\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[0;31m# get consens seq and variant site index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: cannot copy sequence with size 543 to array axis with dimension 559" ] } ], "source": [ "step.remote_build_ref_clusters()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.clustmap import *" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "self = Step3(data, 8, True, ipyclient)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | indexing reference | s3 |\n", "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:05:42 | join unmerged pairs | s3 |\n", "[####################] 100% 0:02:17 | dereplicating | s3 |\n", "[####################] 100% 0:00:39 | splitting dereps | s3 |\n", "[####################] 100% 1:53:03 | mapping reads | s3 |\n", "[####################] 100% 1:40:12 | building clusters | s3 |\n", "[####################] 100% 0:00:24 | calc cluster stats | s3 |\n" ] } ], "source": [ "self.run()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: 5-tortas\n", "[####################] 100% 0:15:41 | inferring [H, E] | s4 |\n", "[####################] 100% 0:00:26 | calculating depths | s5 |\n", "[####################] 100% 0:00:45 | chunking clusters | s5 |\n", "[####################] 100% 16:10:59 | consens calling | s5 |\n", "[####################] 100% 0:01:31 | indexing alleles | s5 |\n" ] } ], "source": [ "self.data.run(\"45\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | indexing reference | s3 |\n", "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:05:49 | join unmerged pairs | s3 |\n" ] } ], "source": [ "self.remote_index_refs()\n", "self.remote_run(\n", " function=concat_multiple_edits,\n", " printstr=(\"concatenating \", \"s3\"),\n", " args=(),\n", ")\n", "self.remote_run(\n", " function=merge_end_to_end,\n", " printstr=(\"join unmerged pairs \", \"s3\"),\n", " args=(False, False,),\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:02:22 | dereplicating | s3 |\n" ] } ], "source": [ "self.remote_run(\n", " function=dereplicate,\n", " printstr=(\"dereplicating \", \"s3\"),\n", " args=(self.nthreads,),\n", " threaded=True,\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:36 | splitting dereps | s3 |\n" ] } ], "source": [ "self.remote_run(\n", " function=split_endtoend_reads,\n", " printstr=(\"splitting dereps \", \"s3\"),\n", " args=(),\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[########## ] 50% 1:18:01 | mapping reads | s3 |" ] } ], "source": [ "self.remote_run(\n", " function=mapping_reads,\n", " printstr=(\"mapping reads \", \"s3\"),\n", " args=(self.nthreads,),\n", " threaded=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sample = list(self.data.samples.values())[0]\n", "merge_end_to_end(self.data, sample, True, True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample = list(data.samples.values())[0]\n", "\n", "infiles = [\n", " os.path.join(data.dirs.edits, \"{}.trimmed_R1_.fastq.gz\".format(sample.name)),\n", " os.path.join(data.dirs.edits, \"{}_R1_concatedit.fq.gz\".format(sample.name)),\n", " os.path.join(data.tmpdir, \"{}_merged.fastq\".format(sample.name)),\n", " os.path.join(data.tmpdir, \"{}_declone.fastq\".format(sample.name)),\n", "]\n", "infiles = [i for i in infiles if os.path.exists(i)]\n", "infile = infiles[-1]\n", "\n", "infile" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/home/deren/Documents/ipyrad/bin/vsearch-linux-x86_64',\n", " '--derep_fulllength',\n", " '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq',\n", " '--strand',\n", " 'plus',\n", " '--output',\n", " '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_derep.fastq',\n", " '--threads',\n", " '2',\n", " '--fasta_width',\n", " '0',\n", " '--fastq_qmax',\n", " '1000',\n", " '--sizeout',\n", " '--relabel_md5']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "strand = \"plus\"\n", "if data.paramsdict[\"datatype\"] is ('gbs' or '2brad'):\n", " strand = \"both\"\n", "nthreads=2\n", "\n", "cmd = [\n", " ip.bins.vsearch,\n", " \"--derep_fulllength\", infile,\n", " \"--strand\", strand,\n", " \"--output\", os.path.join(data.tmpdir, sample.name + \"_derep.fastq\"),\n", " \"--threads\", str(nthreads),\n", " \"--fasta_width\", str(0),\n", " \"--fastq_qmax\", \"1000\",\n", " \"--sizeout\", \n", " \"--relabel_md5\",\n", "]\n", "cmd" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)\n", "errmsg = proc.communicate()[0]\n", "if proc.returncode:\n", " ip.logger.error(\"error inside dereplicate %s\", errmsg)\n", " raise IPyradWarningExit(errmsg)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ ] 0% 0:01:55 | dereplicating | s3 |" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprintstr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"dereplicating \"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnthreads\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mthreaded\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m )\n", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mremote_run\u001b[0;34m(self, printstr, function, args, threaded)\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0mready\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mrasyncs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrasyncs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_progressbar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprintstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 600\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 601\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "s.remote_run(\n", " function=dereplicate,\n", " printstr=(\"dereplicating \", \"s3\"),\n", " args=(s.nthreads,),\n", " threaded=True,\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }