{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import ipyrad as ip\n", "import ipyparallel as ipp" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: \n", " Controller appears to be listening on localhost, but not on this machine.\n", " If this is true, you should specify Client(...,sshserver='you@sacra')\n", " or instruct your controller to listen on an external IP.\n", " RuntimeWarning)\n" ] } ], "source": [ "ipyclient = ipp.Client()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: test\n", "from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json\n" ] }, { "ename": "AttributeError", "evalue": "'Assembly' object has no attribute 'remote_calculate_depths'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mtest2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbranch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"test2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m#test2.run(\"5\", force=True)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mtest2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote_calculate_depths\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m 
\u001b[0mtest2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote_make_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAttributeError\u001b[0m: 'Assembly' object has no attribute 'remote_calculate_depths'" ] } ], "source": [ "data = ip.load_json(\"6-tortas/test.json\")\n", "test2 = data.branch(\"test2\")\n", "#test2.run(\"5\", force=True)\n", "test2.remote_calculate_depths()\n", "test2.remote_make_chunks()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: test\n", "from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json\n", "[####################] 100% 0:00:14 | calculating depths | s5 |\n", "[####################] 100% 0:00:31 | chunking clusters | s5 |\n" ] } ], "source": [ "self = Step5(test2, True, ipyclient)\n", "self.remote_calculate_depths()\n", "self.remote_make_chunks()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/home/deren/local/src/ipyrad/tests/6-tortas/test2-tmpdir/PZ03concat.chunk.35195.35195'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = test2\n", "sample = data.samples[\"PZ03concat\"]\n", "isref = True\n", "for sample in [sample]:\n", " chunks = glob.glob(os.path.join(\n", " data.tmpdir,\n", " \"{}.chunk.*\".format(sample.name)))\n", " chunks.sort(key=lambda x: int(x.split('.')[-1]))\n", "chunkfile = chunks[1]\n", "chunkfile" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "p = Processor(data, sample, chunkfile, isref)\n", "p.run()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": 
[ "({'name': 45206, 'heteros': 1576, 'nsites': 4168046, 'nconsens': 10011},\n", " {'depth': 25179, 'maxh': 3, 'maxn': 2})" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p.counters, p.filters" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "\n", "class Processor:\n", " def __init__(self, data, sample, chunkfile, isref):\n", " self.data = data\n", " self.sample = sample\n", " self.chunkfile = chunkfile\n", " self.isref = isref\n", "\n", " # prepare the processor\n", " self.set_params()\n", " self.init_counters()\n", " self.init_arrays() \n", " self.chroms2ints()\n", "\n", " # run the processor\n", " #self.run()\n", "\n", " def run(self):\n", " self.process_chunk()\n", " self.write_chunk()\n", "\n", " def set_params(self):\n", " # set max limits\n", " self.tmpnum = int(self.chunkfile.split(\".\")[-1])\n", " self.optim = int(self.chunkfile.split(\".\")[-2])\n", " self.este = self.data.stats.error_est.mean()\n", " self.esth = self.data.stats.hetero_est.mean()\n", " self.maxlen = self.data._hackersonly[\"max_fragment_length\"]\n", " if 'pair' in self.data.paramsdict[\"datatype\"]:\n", " self.maxhet = sum(self.data.paramsdict[\"max_Hs_consens\"])\n", " self.maxn = sum(self.data.paramsdict[\"max_Ns_consens\"])\n", " else:\n", " self.maxhet = self.data.paramsdict[\"max_Hs_consens\"][0]\n", " self.maxn = self.data.paramsdict[\"max_Ns_consens\"][0]\n", " # not enforced for ref\n", " if self.isref:\n", " self.maxn = int(1e6)\n", " \n", " def init_counters(self):\n", " # store data for stats counters.\n", " self.counters = {\n", " \"name\": self.tmpnum,\n", " \"heteros\": 0,\n", " \"nsites\": 0,\n", " \"nconsens\": 0,\n", " }\n", "\n", " # store data for what got filtered\n", " self.filters = {\n", " \"depth\": 0,\n", " \"maxh\": 0,\n", " \"maxn\": 0,\n", " }\n", "\n", " # store data for writing\n", " self.storeseq = {}\n", "\n", " def init_arrays(self):\n", " # local copies to use to 
fill the arrays\n", " self.catarr = np.zeros((self.optim, self.maxlen, 4), dtype=np.uint32)\n", " self.nallel = np.zeros((self.optim, ), dtype=np.uint8)\n", " self.refarr = np.zeros((self.optim, 3), dtype=np.int64)\n", "\n", " def chroms2ints(self):\n", " # if reference-mapped then parse the fai to get index number of chroms\n", " if self.isref:\n", " fai = pd.read_csv(\n", " self.data.paramsdict[\"reference_sequence\"] + \".fai\",\n", " names=['scaffold', 'size', 'sumsize', 'a', 'b'],\n", " sep=\"\\t\")\n", " self.faidict = {j: i for i, j in enumerate(fai.scaffold)}\n", " self.revdict = {j: i for i, j in self.faidict.items()}\n", "\n", " # ---------------------------------------------\n", " def process_chunk(self):\n", " # stream through the clusters\n", " with open(self.chunkfile, 'rb') as inclust:\n", " pairdealer = izip(*[iter(inclust)] * 2)\n", " done = 0\n", " while not done:\n", " done, chunk = clustdealer(pairdealer, 1)\n", " if chunk: \n", " self.parse_cluster(chunk)\n", " if self.filter_mindepth():\n", " self.build_consens_and_array()\n", " self.get_heteros()\n", " if self.filter_maxhetero():\n", " if self.filter_maxN_minLen():\n", " self.get_alleles()\n", " self.store_data()\n", "\n", "\n", " def parse_cluster(self, chunk):\n", " # get names and seqs\n", " piece = chunk[0].decode().strip().split(\"\\n\")\n", " self.names = piece[0::2]\n", " self.seqs = piece[1::2]\n", "\n", " # pull replicate read info from seqs\n", " self.reps = [int(n.split(\";\")[-2][5:]) for n in self.names]\n", "\n", " # ref positions\n", " self.ref_position = (-1, 0, 0)\n", " if self.isref:\n", " # parse position from name string\n", " rname = self.names[0].rsplit(\";\", 2)[0]\n", " chrom, posish = rname.rsplit(\":\")\n", " pos0, pos1 = posish.split(\"-\")\n", " # pull idx from .fai reference dict\n", " chromint = self.faidict[chrom] + 1\n", " self.ref_position = (int(chromint), int(pos0), int(pos1))\n", "\n", " def filter_mindepth(self):\n", " # exit if nreps is less than mindepth 
setting\n", " if not nfilter1(self.data, self.reps):\n", " self.filters['depth'] += 1\n", " return 0\n", " return 1\n", "\n", " def build_consens_and_array(self):\n", " # get stacks of base counts\n", " sseqs = [list(seq) for seq in self.seqs]\n", " arrayed = np.concatenate(\n", " [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]\n", " ).astype(bytes)\n", "\n", " # ! enforce maxlen limit !\n", " self.arrayed = arrayed[:, :self.maxlen]\n", " \n", " # get unphased consens sequence from arrayed\n", " self.consens = base_caller(\n", " self.arrayed, \n", " self.data.paramsdict[\"mindepth_majrule\"], \n", " self.data.paramsdict[\"mindepth_statistical\"],\n", " self.esth, \n", " self.este,\n", " )\n", "\n", " # trim Ns from the left and right ends\n", " self.consens[self.consens == b\"-\"] = b\"N\"\n", " trim = np.where(self.consens != b\"N\")[0]\n", " ltrim, rtrim = trim.min(), trim.max()\n", " self.consens = self.consens[ltrim:rtrim + 1]\n", " self.arrayed = self.arrayed[:, ltrim:rtrim + 1]\n", "\n", " # update position for trimming\n", " self.ref_position = (\n", " self.ref_position[0], \n", " self.ref_position[1] + ltrim,\n", " self.ref_position[1] + ltrim + rtrim + 1,\n", " )\n", "\n", " def get_heteros(self):\n", " self.hidx = [\n", " i for (i, j) in enumerate(self.consens) if \n", " j.decode() in list(\"RKSYWM\")]\n", " self.nheteros = len(self.hidx)\n", "\n", " def filter_maxhetero(self):\n", " if not nfilter2(self.nheteros, self.maxhet):\n", " self.filters['maxh'] += 1\n", " return 0\n", " return 1\n", "\n", " def filter_maxN_minLen(self):\n", " if not nfilter3(self.consens, self.maxn):\n", " self.filters['maxn'] += 1\n", " return 0\n", " return 1\n", "\n", " def get_alleles(self):\n", " # if less than two Hs then there is only one allele\n", " if len(self.hidx) < 2:\n", " self.nalleles = 1\n", " else:\n", " # array of hetero sites\n", " harray = self.arrayed[:, self.hidx]\n", " # remove reads with - or N at variable site\n", " harray = harray[~np.any(harray 
== b\"-\", axis=1)]\n", " harray = harray[~np.any(harray == b\"N\", axis=1)]\n", " # get counts of each allele (e.g., AT:2, CG:2)\n", " ccx = Counter([tuple(i) for i in harray])\n", " # remove low freq alleles if more than 2, since they may reflect\n", " # seq errors at hetero sites, making a third allele, or a new\n", " # allelic combination that is not real.\n", " if len(ccx) > 2:\n", " totdepth = harray.shape[0]\n", " cutoff = max(1, totdepth // 10)\n", " alleles = [i for i in ccx if ccx[i] > cutoff]\n", " else:\n", " alleles = ccx.keys()\n", " self.nalleles = len(alleles)\n", "\n", " if self.nalleles == 2:\n", " try:\n", " self.consens = storealleles(self.consens, self.hidx, alleles)\n", " except (IndexError, KeyError):\n", " # the H sites do not form good alleles\n", " ip.logger.info(\"failed at phasing loc, skipping\")\n", " ip.logger.info(\"\"\"\n", " consens %s\n", " hidx %s\n", " alleles %s\n", " \"\"\", self.consens, self.hidx, alleles)\n", " # todo return phase or no-phase and store for later\n", " # ...\n", "\n", " def store_data(self):\n", " # current counter\n", " cidx = self.counters[\"nconsens\"]\n", " self.nallel[cidx] = self.nalleles\n", " self.refarr[cidx] = self.ref_position\n", " # store a reduced array with only CATG\n", " catg = np.array(\n", " [np.sum(self.arrayed == i, axis=0) for i in [b'C', b'A', b'T', b'G']],\n", " dtype='uint16').T\n", " # do not allow ints larger than 65535 (uint16)\n", " self.catarr[cidx, :catg.shape[0], :] = catg\n", "\n", " # store the seqdata and advance counters\n", " self.storeseq[cidx] = b\"\".join(list(self.consens))\n", " self.counters[\"name\"] += 1\n", " self.counters[\"nconsens\"] += 1\n", " self.counters[\"heteros\"] += self.nheteros\n", "\n", " # ----------------------------------------------\n", " def write_chunk(self):\n", "\n", " # find last empty size\n", " end = np.where(np.all(self.refarr == 0, axis=1))[0]\n", " if np.any(end):\n", " end = end.min()\n", " else:\n", " end = self.refarr.shape[0]\n", 
"\n", " # write final consens string chunk\n", " consenshandle = os.path.join(\n", " self.data.tmpdir,\n", " \"{}_tmpcons.{}.{}\".format(self.sample.name, end, self.tmpnum))\n", "\n", " # write chunk. \n", " if self.storeseq:\n", " with open(consenshandle, 'wt') as outfile:\n", "\n", " # denovo just write the consens simple\n", " if not self.isref:\n", " outfile.write(\n", " \"\\n\".join(\n", " [\">\" + self.sample.name + \"_\" + str(key) + \\\n", " \"\\n\" + self.storeseq[key].decode() \n", " for key in self.storeseq]))\n", "\n", " # reference needs to store if the read is revcomp to reference\n", " else:\n", " outfile.write(\n", " \"\\n\".join(\n", " [\"{}:{}:{}-{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\"\n", " .format(\n", " self.sample.name,\n", " self.refarr[i][0],\n", " self.refarr[i][1],\n", " self.refarr[i][2],\n", " 0,\n", " self.revdict[self.refarr[i][0] - 1],\n", " self.refarr[i][1],\n", " 0,\n", " #\"{}M\".format(refarr[i][2] - refarr[i][1]),\n", " make_allele_cigar(\n", " \"\".join(['.' 
if i.islower() else i for i \n", " in self.storeseq[i].decode()]),\n", " \".\",\n", " \"S\",\n", " ),\n", " #\"{}M\".format(len(self.storeseq[i].decode())),\n", " \"*\",\n", " 0,\n", " self.refarr[i][2] - self.refarr[i][1],\n", " self.storeseq[i].decode(),\n", " \"*\",\n", " # \"XT:Z:{}\".format(\n", " # make_indel_cigar(storeseq[i].decode())\n", " # ),\n", "\n", " ) for i in self.storeseq.keys()]\n", " ))\n", "\n", " # store reduced size arrays with indexes matching to keep indexes\n", " tmp5 = consenshandle.replace(\"_tmpcons.\", \"_tmpcats.\")\n", " with h5py.File(tmp5, 'w') as io5:\n", " # reduce catarr to uint16 size for easier storage\n", " self.catarr[self.catarr > 65535] = 65535\n", " self.catarr = self.catarr.astype(np.uint16)\n", " io5.create_dataset(name=\"cats\", data=self.catarr[:end])\n", " io5.create_dataset(name=\"alls\", data=self.nallel[:end])\n", " io5.create_dataset(name=\"chroms\", data=self.refarr[:end])\n", " del self.catarr\n", " del self.nallel\n", " del self.refarr\n", "\n", " # return stats\n", " self.counters['nsites'] = sum([len(i) for i in self.storeseq.values()])\n", " del self.storeseq\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res = ipyclient.apply(process_chunks, *(data, sample, chunkfile, isref))\n", "res.result()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "statsdicts = Processor(data, sample, chunkfile, isref)\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "({'name': 10148, 'heteros': 1736, 'nsites': 4258854, 'nconsens': 10148},\n", " {'depth': 25043, 'maxh': 4, 'maxn': 0})" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "statsdicts.counters, statsdicts.filters" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<__main__.D at 0x7f167c837b70>" ] }, 
"execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aa = ipyclient[0].apply(Processor, )" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "# write sequences to SAM file\n", "combs1 = glob.glob(os.path.join(\n", " data.tmpdir,\n", " \"{}_tmpcons.*\".format(sample.name)))\n", "combs1.sort(key=lambda x: int(x.split(\".\")[-1]))\n", "\n", "combs1" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'Processor' object has no attribute 'refarr'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrefarr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m: 'Processor' object has no attribute 'refarr'" ] } ], "source": [ "\n", " # write sequences to SAM file\n", " combs1 = glob.glob(os.path.join(\n", " data.tmpdir,\n", " \"{}_tmpcons.*\".format(sample.name)))\n", " combs1.sort(key=lambda x: int(x.split(\".\")[-1]))\n", "\n", " # open sam handle for writing to bam\n", " samfile = sample.files.consens.replace(\".bam\", \".sam\") \n", " with open(samfile, 'w') as outf:\n", "\n", " # parse fai file for writing headers\n", " fai = \"{}.fai\".format(data.paramsdict[\"reference_sequence\"])\n", " fad = pd.read_csv(fai, sep=\"\\t\", names=[\"SN\", \"LN\", \"POS\", \"N1\", \"N2\"])\n", " headers = [\"@HD\\tVN:1.0\\tSO:coordinate\"]\n", " headers += [\n", " \"@SQ\\tSN:{}\\tLN:{}\".format(i, j)\n", " for (i, j) in zip(fad[\"SN\"], fad[\"LN\"])\n", " ]\n", " 
outf.write(\"\\n\".join(headers) + \"\\n\")\n", "\n", " # write to file with sample names imputed to line up with catg array\n", " counter = 0\n", " for fname in combs1:\n", " with open(fname) as infile:\n", " # impute catg ordered seqnames \n", " data = infile.readlines()\n", " fdata = []\n", " for line in data:\n", " name, chrom, rest = line.rsplit(\":\", 2)\n", " fdata.append(\n", " \"{}_{}:{}:{}\".format(name, counter, chrom, rest)\n", " )\n", " counter += 1\n", " outf.write(\"\".join(fdata) + \"\\n\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "\n", "class Processor:\n", " def __init__(self, data, sample, chunkfile, isref):\n", " self.data = data\n", " self.sample = sample\n", " self.chunkfile = chunkfile\n", " self.isref = isref\n", "\n", " # prepare the processor\n", " self.set_params()\n", " self.init_counters()\n", " self.init_arrays() \n", " self.chroms2ints()\n", "\n", " # run the processor\n", " def run(self):\n", " self.process_chunk()\n", " self.write_chunk()\n", "\n", " def set_params(self):\n", " # set max limits\n", " self.tmpnum = int(self.chunkfile.split(\".\")[-1])\n", " self.optim = int(self.chunkfile.split(\".\")[-2])\n", " self.este = self.data.stats.error_est.mean()\n", " self.esth = self.data.stats.hetero_est.mean()\n", " self.maxlen = self.data._hackersonly[\"max_fragment_length\"]\n", " if 'pair' in self.data.paramsdict[\"datatype\"]:\n", " self.maxhet = sum(self.data.paramsdict[\"max_Hs_consens\"])\n", " self.maxn = sum(self.data.paramsdict[\"max_Ns_consens\"])\n", " else:\n", " self.maxhet = self.data.paramsdict[\"max_Hs_consens\"][0]\n", " self.maxn = self.data.paramsdict[\"max_Ns_consens\"][0]\n", " # not enforced for ref\n", " if self.isref:\n", " self.maxn = int(1e6)\n", " \n", " def init_counters(self):\n", " # store data for stats counters.\n", " self.counters = {\n", " \"name\": self.tmpnum,\n", " \"heteros\": 0,\n", " \"nsites\": 0,\n", " \"nconsens\": 0,\n", " }\n", "\n", " # 
store data for what got filtered\n", " self.filters = {\n", " \"depth\": 0,\n", " \"maxh\": 0,\n", " \"maxn\": 0,\n", " }\n", "\n", " # store data for writing\n", " self.storeseq = {}\n", "\n", " def init_arrays(self):\n", " # local copies to use to fill the arrays\n", " self.catarr = np.zeros((self.optim, self.maxlen, 4), dtype=np.uint32)\n", " self.nallel = np.zeros((self.optim, ), dtype=np.uint8)\n", " self.refarr = np.zeros((self.optim, 3), dtype=np.int64)\n", "\n", " def chroms2ints(self):\n", " # if reference-mapped then parse the fai to get index number of chroms\n", " if self.isref:\n", " fai = pd.read_csv(\n", " self.data.paramsdict[\"reference_sequence\"] + \".fai\",\n", " names=['scaffold', 'size', 'sumsize', 'a', 'b'],\n", " sep=\"\\t\")\n", " self.faidict = {j: i for i, j in enumerate(fai.scaffold)}\n", " self.revdict = {j: i for i, j in self.faidict.items()}\n", "\n", " # ---------------------------------------------\n", " def process_chunk(self):\n", " # stream through the clusters\n", " with open(self.chunkfile, 'rb') as inclust:\n", " pairdealer = izip(*[iter(inclust)] * 2)\n", " done = 0\n", " while not done:\n", " done, chunk = clustdealer(pairdealer, 1)\n", " if chunk: \n", " self.parse_cluster(chunk)\n", " if self.filter_mindepth():\n", " self.build_consens_and_array()\n", " self.get_heteros()\n", " if self.filter_maxhetero():\n", " if self.filter_maxN_minLen():\n", " self.get_alleles()\n", " self.store_data()\n", "\n", " def parse_cluster(self, chunk):\n", " # get names and seqs\n", " piece = chunk[0].decode().strip().split(\"\\n\")\n", " self.names = piece[0::2]\n", " self.seqs = piece[1::2]\n", "\n", " # pull replicate read info from seqs\n", " self.reps = [int(n.split(\";\")[-2][5:]) for n in self.names]\n", "\n", " # ref positions\n", " self.ref_position = (-1, 0, 0)\n", " if self.isref:\n", " # parse position from name string\n", " rname = self.names[0].rsplit(\";\", 2)[0]\n", " chrom, posish = rname.rsplit(\":\")\n", " pos0, pos1 = 
posish.split(\"-\")\n", " # pull idx from .fai reference dict\n", " chromint = self.faidict[chrom] + 1\n", " self.ref_position = (int(chromint), int(pos0), int(pos1))\n", "\n", " def filter_mindepth(self):\n", " # exit if nreps is less than mindepth setting\n", " if not nfilter1(self.data, self.reps):\n", " self.filters['depth'] += 1\n", " return 0\n", " return 1\n", "\n", " def build_consens_and_array(self):\n", " # get stacks of base counts\n", " sseqs = [list(seq) for seq in self.seqs]\n", " arrayed = np.concatenate(\n", " [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]\n", " ).astype(bytes)\n", "\n", " # ! enforce maxlen limit !\n", " self.arrayed = arrayed[:, :self.maxlen]\n", " \n", " # get unphased consens sequence from arrayed\n", " self.consens = base_caller(\n", " self.arrayed, \n", " self.data.paramsdict[\"mindepth_majrule\"], \n", " self.data.paramsdict[\"mindepth_statistical\"],\n", " self.esth, \n", " self.este,\n", " )\n", "\n", " # trim Ns from the left and right ends\n", " self.consens[self.consens == b\"-\"] = b\"N\"\n", " trim = np.where(self.consens != b\"N\")[0]\n", " ltrim, rtrim = trim.min(), trim.max()\n", " self.consens = self.consens[ltrim:rtrim + 1]\n", " self.arrayed = self.arrayed[:, ltrim:rtrim + 1]\n", "\n", " # update position for trimming\n", " self.ref_position = (\n", " self.ref_position[0], \n", " self.ref_position[1] + ltrim,\n", " self.ref_position[1] + ltrim + rtrim + 1,\n", " )\n", "\n", " def get_heteros(self):\n", " self.hidx = [\n", " i for (i, j) in enumerate(self.consens) if \n", " j.decode() in list(\"RKSYWM\")]\n", " self.nheteros = len(self.hidx)\n", "\n", " def filter_maxhetero(self):\n", " if not nfilter2(self.nheteros, self.maxhet):\n", " self.filters['maxh'] += 1\n", " return 0\n", " return 1\n", "\n", " def filter_maxN_minLen(self):\n", " if not nfilter3(self.consens, self.maxn):\n", " self.filters['maxn'] += 1\n", " return 0\n", " return 1\n", "\n", " def get_alleles(self):\n", " # if less than two Hs then 
there is only one allele\n", " if len(self.hidx) < 2:\n", " self.nalleles = 1\n", " else:\n", " # array of hetero sites\n", " harray = self.arrayed[:, self.hidx]\n", " # remove reads with - or N at variable site\n", " harray = harray[~np.any(harray == b\"-\", axis=1)]\n", " harray = harray[~np.any(harray == b\"N\", axis=1)]\n", " # get counts of each allele (e.g., AT:2, CG:2)\n", " ccx = Counter([tuple(i) for i in harray])\n", " # remove low freq alleles if more than 2, since they may reflect\n", " # seq errors at hetero sites, making a third allele, or a new\n", " # allelic combination that is not real.\n", " if len(ccx) > 2:\n", " totdepth = harray.shape[0]\n", " cutoff = max(1, totdepth // 10)\n", " alleles = [i for i in ccx if ccx[i] > cutoff]\n", " else:\n", " alleles = ccx.keys()\n", " self.nalleles = len(alleles)\n", "\n", " if self.nalleles == 2:\n", " try:\n", " self.consens = storealleles(self.consens, self.hidx, alleles)\n", " except (IndexError, KeyError):\n", " # the H sites do not form good alleles\n", " ip.logger.info(\"failed at phasing loc, skipping\")\n", " ip.logger.info(\"\"\"\n", " consens %s\n", " hidx %s\n", " alleles %s\n", " \"\"\", self.consens, self.hidx, alleles)\n", "\n", " def store_data(self):\n", " # current counter\n", " cidx = self.counters[\"nconsens\"]\n", " self.nallel[cidx] = self.nalleles\n", " self.refarr[cidx] = self.ref_position\n", " # store a reduced array with only CATG\n", " catg = np.array(\n", " [np.sum(self.arrayed == i, axis=0)\n", " for i in [b'C', b'A', b'T', b'G']],\n", " dtype='uint32').T\n", " self.catarr[cidx, :catg.shape[0], :] = catg\n", "\n", " # store the seqdata and advance counters\n", " self.storeseq[cidx] = b\"\".join(list(self.consens))\n", " self.counters[\"name\"] += 1\n", " self.counters[\"nconsens\"] += 1\n", " self.counters[\"heteros\"] += self.nheteros\n", "\n", " # ----------------------------------------------\n", " def write_chunk(self):\n", "\n", " # find last empty size\n", " end = 
np.where(np.all(self.refarr == 0, axis=1))[0]\n", " if np.any(end):\n", " end = end.min()\n", " else:\n", " end = self.refarr.shape[0]\n", "\n", " # write final consens string chunk\n", " consenshandle = os.path.join(\n", " self.data.tmpdir,\n", " \"{}_tmpcons.{}.{}\".format(self.sample.name, end, self.tmpnum))\n", "\n", " # write chunk. \n", " if self.storeseq:\n", " with open(consenshandle, 'wt') as outfile:\n", "\n", " # denovo just write the consens simple\n", " if not self.isref:\n", " outfile.write(\n", " \"\\n\".join(\n", " [\">\" + self.sample.name + \"_\" + str(key) + \\\n", " \"\\n\" + self.storeseq[key].decode() \n", " for key in self.storeseq]))\n", "\n", " # reference needs to store if the read is revcomp to reference\n", " else:\n", " outfile.write(\n", " \"\\n\".join(\n", " [\"{}:{}:{}-{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\"\n", " .format(\n", " self.sample.name,\n", " self.refarr[i][0],\n", " self.refarr[i][1],\n", " self.refarr[i][2],\n", " 0,\n", " self.revdict[self.refarr[i][0] - 1],\n", " self.refarr[i][1],\n", " 0,\n", "\n", " # why doesn't this line up with +2 ?\n", " # make_indel_cigar(storeseq[i].decode()),\n", " #\"{}M\".format(refarr[i][2] - refarr[i][1]),\n", " \"{}M\".format(len(self.storeseq[i].decode())),\n", " \"*\",\n", " 0,\n", "\n", " ## + 2 here\n", " self.refarr[i][2] - self.refarr[i][1],\n", " #len(self.storeseq[i].decode()),\n", " self.storeseq[i].decode(),\n", " \"*\",\n", " # \"XT:Z:{}\".format(\n", " # make_indel_cigar(storeseq[i].decode())\n", " # ),\n", "\n", " ) for i in self.storeseq.keys()]\n", " ))\n", "\n", " # store reduced size arrays with indexes matching to keep indexes\n", " tmp5 = consenshandle.replace(\"_tmpcons.\", \"_tmpcats.\")\n", " with h5py.File(tmp5, 'w') as io5:\n", " io5.create_dataset(name=\"cats\", data=self.catarr[:end])\n", " io5.create_dataset(name=\"alls\", data=self.nallel[:end])\n", " io5.create_dataset(name=\"chroms\", data=self.refarr[:end])\n", " del self.catarr\n", " del 
self.nallel\n", " del self.refarr\n", "\n", " # return stats\n", " self.counters['nsites'] = sum([len(i) for i in self.storeseq.values()])\n", " del self.storeseq\n", " #return self.counters, self.filters\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[5, 1]\n", "(1, 3481, 1)\n", "(1, 3481, 3748)\n", "[6]\n", "(1, 30443, 1)\n", "(1, 30443, 30713)\n", "[9, 1, 1]\n", "(1, 33785, 1)\n", "(1, 33785, 34014)\n" ] } ], "source": [ "with open(self.chunkfile, 'rb') as inclust:\n", " pairdealer = izip(*[iter(inclust)] * 2)\n", " for i in range(20):\n", " _, chunk = clustdealer(pairdealer, 1)\n", " if chunk:\n", " self.parse_cluster(chunk)\n", " if self.filter_mindepth():\n", " print(self.reps)\n", " self.build_consens_and_array()\n", " \n", " # get stacks of base counts\n", " sseqs = [list(seq) for seq in self.seqs]\n", " arrayed = np.concatenate(\n", " [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]\n", " ).astype(bytes)\n", " self.arrayed = arrayed[:, :self.maxlen]\n", " \n", " # get consens call for each site, applies paralog-x-site filter\n", " self.consens = base_caller(\n", " arrayed, \n", " self.data.paramsdict[\"mindepth_majrule\"], \n", " self.data.paramsdict[\"mindepth_statistical\"],\n", " self.esth, \n", " self.este,\n", " )\n", " self.consens[self.consens == b\"-\"] = b\"N\"\n", " trim = np.where(self.consens != b\"N\")[0]\n", " ltrim, rtrim = trim.min(), trim.max()\n", " self.consens = self.consens[ltrim:rtrim + 1]\n", " self.arrayed = self.arrayed[:, ltrim:rtrim + 1]\n", " print(self.ref_position)\n", " \n", " # update position for trimming\n", " self.ref_position = (\n", " self.ref_position[0], \n", " self.ref_position[1] + ltrim,\n", " self.ref_position[1] + ltrim + rtrim + 1,\n", " )\n", " \n", " print(self.ref_position)\n" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([ 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# min where it is not N\n", "np.argmin(np.where(arr != \"N\"))\n", "np.where(arr != \"N\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "consens = arr\n", "trim = np.where(consens != \"N\")[0]\n", "ltrim, rtrim = trim.min(), trim.max()\n", "consens = consens[ltrim:rtrim + 1]" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'a']\n", "7 18\n" ] } ], "source": [ "print(consens)\n", "print(pos[0] + ltrim, rtrim + 1)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'], dtype='\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;31m# write final consens string chunk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 111\u001b[0m consenshandle = os.path.join(\n\u001b[0;32m--> 112\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 113\u001b[0m \"{}_tmpcons.{}.{}\".format(sample.name, end, tmpnum))\n\u001b[1;32m 114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAttributeError\u001b[0m: 'Assembly' object has no attribute 'tmpdir'" ] } ], "source": [ "# find last empty size\n", "end = np.where(np.all(refarr == 0, axis=1))[0]\n", "if np.any(end):\n", " end = end.min()\n", "else:\n", " end = refarr.shape[0]\n", "\n", "# write final consens string chunk\n", "consenshandle = os.path.join(\n", " data.tmpdir,\n", " \"{}_tmpcons.{}.{}\".format(sample.name, end, tmpnum))\n", "\n", "# write chunk. 
\n", "if storeseq:\n", " with open(consenshandle, 'wt') as outfile:\n", "\n", " # denovo just write the consens simple\n", " if not isref:\n", " outfile.write(\n", " \"\\n\".join([\">\" + sample.name + \"_\" + str(key) + \\\n", " \"\\n\" + storeseq[key].decode() for key in storeseq]))\n", "\n", " # reference needs to store if the read is revcomp to reference\n", " else:\n", " outfile.write(\n", " \"\\n\".join(\n", " [\"{}:{}:{}-{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\"\n", " .format(\n", " sample.name,\n", " refarr[i][0],\n", " refarr[i][1],\n", " refarr[i][2],\n", " 0,\n", " revdict[refarr[i][0] - 1],\n", " refarr[i][1],\n", " 0,\n", "\n", " # why doesn't this line up with +2 ?\n", " # make_indel_cigar(storeseq[i].decode()),\n", " #\"{}M\".format(refarr[i][2] - refarr[i][1]),\n", " \"{}M\".format(len(storeseq[i].decode())),\n", " \"*\",\n", " 0,\n", "\n", " ## + 2 here\n", " refarr[i][2] - refarr[i][1],\n", "\n", " storeseq[i].decode(),\n", " \"*\",\n", " # \"XT:Z:{}\".format(\n", " # make_indel_cigar(storeseq[i].decode())\n", " # ),\n", "\n", " ) for i in storeseq.keys()]\n", " ))\n", "\n", "# store reduced size arrays with indexes matching to keep indexes\n", "tmp5 = consenshandle.replace(\"_tmpcons.\", \"_tmpcats.\")\n", "with h5py.File(tmp5, 'w') as io5:\n", " io5.create_dataset(name=\"cats\", data=catarr[:end])\n", " io5.create_dataset(name=\"alls\", data=nallel[:end])\n", " io5.create_dataset(name=\"chroms\", data=refarr[:end])\n", "del catarr\n", "del nallel\n", "del refarr\n", "\n", "# return stats\n", "counters['nsites'] = sum([len(i) for i in storeseq.values()])\n", "del storeseq\n", "return counters, filters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 98, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: tortas\n", "from saved path: ~/local/src/ipyrad/tests/6-tortas/tortas.json\n" ] } ], "source": [ "data = ip.load_json(\"6-tortas/tortas.json\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New Assembly: tortas\n", "0 assembly_name tortas \n", "1 project_dir ./6-tortas \n", "2 raw_fastq_path \n", "3 barcodes_path \n", "4 sorted_fastq_path /home/deren/Documents/Tortoise/fastq-concats-GG-d10-118/*.gz\n", "5 assembly_method reference \n", "6 reference_sequence /home/deren/Documents/Tortoise/lgeorge.genome.fa\n", "7 datatype pairddrad \n", "8 restriction_overhang ('CATG', 'AATT') \n", "9 max_low_qual_bases 5 \n", "10 phred_Qscore_offset 33 \n", "11 mindepth_statistical 6 \n", "12 mindepth_majrule 6 \n", "13 maxdepth 10000 \n", "14 clust_threshold 0.85 \n", "15 max_barcode_mismatch 0 \n", "16 filter_adapters 0 \n", "17 filter_min_trim_len 35 \n", "18 max_alleles_consens 2 \n", "19 max_Ns_consens (5, 5) \n", "20 max_Hs_consens (8, 8) \n", "21 min_samples_locus 4 \n", "22 max_SNPs_locus (20, 20) \n", "23 max_Indels_locus (8, 8) \n", "24 max_shared_Hs_locus 0.5 \n", "25 trim_reads (0, 0, 0, 0) \n", "26 trim_loci (0, 0, 0, 0) \n", "27 output_formats ['p', 's', 'v'] \n", "28 pop_assign_file \n" ] } ], "source": [ "data = ip.Assembly(\"tortas\")\n", "data.set_params(\"project_dir\", \"6-tortas\")\n", "data.set_params(\"sorted_fastq_path\", \"/home/deren/Documents/Tortoise/fastq-concats-GG-d10-118/*.gz\")\n", "data.set_params(\"restriction_overhang\", (\"CATG\", \"AATT\"))\n", "data.set_params(\"datatype\", \"pairddrad\")\n", "data.set_params(\"assembly_method\", \"reference\")\n", "data.set_params(\"reference_sequence\", \"/home/deren/Documents/Tortoise/lgeorge.genome.fa\")\n", "data.get_params()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], 
"source": [ "data.ipcluster['threads'] = 4" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: tortas\n", "[####################] 100% 1:22:46 | processing reads | s2 |\n" ] } ], "source": [ "data.run(\"2\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statereads_rawreads_passed_filter
AGO02concat21105029411010349
AGO08concat21340840113343190
AGO09concat21565012715593137
AGO11concat21284893612766537
AGO12concat21259053112563936
\n", "
" ], "text/plain": [ " state reads_raw reads_passed_filter\n", "AGO02concat 2 11050294 11010349\n", "AGO08concat 2 13408401 13343190\n", "AGO09concat 2 15650127 15593137\n", "AGO11concat 2 12848936 12766537\n", "AGO12concat 2 12590531 12563936" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.stats.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.clustmap import *\n", "self = Step3(data, 8, True, ipyclient)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | indexing reference | s3 |\n", "[####################] 100% 0:00:05 | concatenating | s3 |\n", "[####################] 100% 1:31:45 | join unmerged pairs | s3 |\n", "[####################] 100% 1:34:21 | dereplicating | s3 |\n", "[####################] 100% 0:45:59 | splitting dereps | s3 |\n", "[####################] 100% 6:06:27 | mapping reads | s3 |\n", "[####################] 100% 4:20:31 | building clusters | s3 |\n", "[####################] 100% 0:01:17 | calc cluster stats | s3 |\n" ] } ], "source": [ "self.run()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "test = self.data.branch('test', [i.name for i in self.samples[:4]])" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:10:21 | inferring [H, E] | s4 |\n", "[####################] 100% 0:00:16 | calculating depths | s5 |\n", "[####################] 100% 0:00:29 | chunking clusters | s5 |\n", "[####################] 100% 0:14:46 | consens calling | s5 |\n", "[####################] 100% 0:01:30 | indexing alleles | s5 |\n" ] } ], 
"source": [ "test.run('45')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading Assembly: test\n", "from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json\n" ] } ], "source": [ "test = ip.load_json(\"6-tortas/test.json\")\n" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:15 | calculating depths | s5 |\n", "[####################] 100% 0:00:27 | chunking clusters | s5 |\n", "[####################] 100% 0:14:45 | consens calling | s5 |\n", "[####################] 100% 0:01:30 | indexing alleles | s5 |\n" ] } ], "source": [ "test.run(\"5\", force=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "data = test\n", "samples = list(data.samples.values())\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.cluster_across import *\n", "self = Step6(data, True, ipyclient)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:54 | concatenating bams | s6 |\n" ] } ], "source": [ "self.remote_concat_bams()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:13 | building clusters | s6 |\n" ] } ], "source": [ "self.remote_build_ref_regions()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "49" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# prepare i/o\n", "bamfile = AlignmentFile(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.cat.sorted.bam\".format(self.data.name)),\n", " 
'rb')\n", "outfile = gzip.open(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.catcons.gz\".format(self.data.name)),\n", " 'wb')\n", "\n", "# write header with sample names\n", "outfile.write(\n", " b\" \".join([i.name.encode() for i in self.samples]) + b\"\\n\") " ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "def remote_build_ref_clusters(self):\n", " \"build clusters and find variants/indels to store\"\n", "\n", " # prepare i/o\n", " bamfile = AlignmentFile(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.cat.sorted.bam\".format(self.data.name)),\n", " 'rb')\n", " outfile = gzip.open(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.catcons.gz\".format(self.data.name)),\n", " 'wb')\n", "\n", " # write header with sample names\n", " snames = sorted([i.name for i in self.samples])\n", " nsamples = len(snames)\n", " outfile.write(\n", " b\" \".join([i.encode() for i in snames]) + b\"\\n\")\n", "\n", " # get clusters\n", " iregions = iter(self.regions)\n", " clusts = []\n", " while 1:\n", " # pull in the cluster\n", " try:\n", " region = next(iregions)\n", " reads = bamfile.fetch(*region)\n", " except StopIteration:\n", " break\n", "\n", " # pull in the reference for this region\n", " refn, refs = get_ref_region(\n", " data.paramsdict[\"reference_sequence\"], \n", " region[0], region[1] + 1, region[2] + 1,\n", " )\n", "\n", " # build cluster dict for sorting \n", " rdict = {}\n", " for read in reads:\n", " rdict[read.qname] = read.seq \n", " keys = sorted(rdict.keys(), key=lambda x: x.rsplit(\":\", 2)[0])\n", "\n", " # make empty array\n", " arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)\n", "\n", " # fill it\n", " arr[0] = list(refs)\n", " for idx, key in enumerate(keys):\n", " # get start and stop from name\n", " sname = key.rsplit(\"_\", 1)[0]\n", " rstart, rstop = key.rsplit(\":\", 2)[-1].split(\"-\")\n", " sidx = snames.index(sname)\n", "\n", " # get start relative to 
ref\n", " start = int(rstart) - int(region[1]) - 1\n", " stop = start + int(rstop) - int(rstart)\n", " print(sidx + 1, start, stop, arr.shape[1])\n", " try:\n", " arr[sidx + 1, int(start): int(stop)] = list(rdict[key])\n", " except ValueError:\n", " print(rdict[key])\n", "\n", " # get consens seq and variant site index \n", " print(\"\")\n", " clust = []\n", " avars = refvars(arr.view(np.uint8), PSEUDO_REF)\n", " dat = b\"\".join(avars.view(\"S1\")[:, 0]).decode()\n", " snpstring = \"\".join([\"*\" if i else \" \" for i in avars[:, 1]])\n", " clust.append(\"ref_{}:{}-{}\\n{}\".format(*region, dat))\n", "\n", " # or, just build variant string (or don't...)\n", " # write all loci with [locids, nsnps, npis, nindels, ?]\n", " # for key in keys:\n", " # clust.append(\"{}\\n{}\".format(key, rdict[key]))\n", " # clust.append(\"SNPs\\n\" + snpstring)\n", " # clusts.append(\"\\n\".join(clust))\n", "\n", " outfile.write(str.encode(\"\\n//\\n//\\n\".join(clusts) + \"\\n//\\n//\\n\"))\n", " outfile.close()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PZ03concat_10:1:67768-68840\n", "PZ03concat 960 0 1072 1301\n", 
"CATGAGATGCTAACGAGGTCTCTCAAAGCTCCAGGCCCAGCTGACTGACAGCTTTGCATACTGTTGTTCTTNNNNNNNNNNNNNNNNNNNNTTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCNCCTCCCCTCAATAATGATACTTAGCACTTGATGAATTNNNNNNNNNNNNNNNNNNNNAATTAAATAAGCCTTACCTGCTGTGAAATGTCAATACTATTTTGAAGATGGGGAAACCGAGGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTTCTCAGGGGTCCAGCAACCTTCAAGCCCTTGTTCTCAAGATCCCCTTTGGAATAAGACCTCGTCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAGTCCAGTCCCCTGCCCTCATGGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n" ] }, { "data": { "text/plain": [ "array([[b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C',\n", " b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G',\n", " b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T',\n", " b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C',\n", " b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T',\n", " b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G',\n", " b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', 
b'G', b'G',\n", " b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G',\n", " b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T',\n", " b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A',\n", " b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A',\n", " b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C',\n", " b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T',\n", " b'T'],\n", " [b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C',\n", " b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G',\n", " b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T',\n", " b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C',\n", " b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T',\n", " b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G',\n", " b'G', b'C', b'T', b'G', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G',\n", " b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G',\n", " b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T',\n", " b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A',\n", " b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A',\n", " b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C',\n", " b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T',\n", " b'T'],\n", " [b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C',\n", " b'A', 
b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G',\n", " b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T',\n", " b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C',\n", " b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T',\n", " b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G',\n", " b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G',\n", " b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G',\n", " b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T',\n", " b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A',\n", " b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A',\n", " b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C',\n", " b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T',\n", " b'T'],\n", " [b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C',\n", " b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G',\n", " b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T',\n", " b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C',\n", " b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T',\n", " b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G',\n", " b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', 
b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',\n", " b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G',\n", " b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G',\n", " b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T',\n", " b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A',\n", " b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A',\n", " b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C',\n", " b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T',\n", " b'T']], dtype='|S1')" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iregions = iter(self.regions[:50])\n", "\n", "def build_ref_clusters2(self):\n", " \"build clusters and find variants/indels to store\"\n", " # prepare i/o\n", " bamfile = AlignmentFile(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.cat.sorted.bam\".format(self.data.name)),\n", " 'rb')\n", " outfile = gzip.open(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.catcons.gz\".format(self.data.name)),\n", " 'wb')\n", "\n", " # write header with sample names\n", " snames = sorted([i.name for i in self.samples])\n", " nsamples = len(snames)\n", " outfile.write(\n", " b\" \".join([i.encode() for i in snames]) + b\"\\n\")\n", "\n", " # get clusters\n", " clusts = []\n", " while 1:\n", " # pull in the cluster\n", " try:\n", " region = next(iregions)\n", " reads = bamfile.fetch(*region)\n", " except StopIteration:\n", " break\n", "\n", " # pull in the reference for this region\n", " refn, refs = get_ref_region(\n", " data.paramsdict[\"reference_sequence\"], \n", " region[0], region[1] + 1, region[2] + 1,\n", " )\n", "\n", " # build 
cluster dict for sorting \n", " rdict = {}\n", " for read in reads:\n", " rdict[read.qname] = read.seq \n", " keys = sorted(rdict.keys(), key=lambda x: x.rsplit(\":\", 2)[0])\n", "\n", " # make empty array\n", " arr = np.zeros((len(rdict), len(refs)), dtype=bytes)\n", " \n", " # fill it\n", " arr[0] = list(refs)\n", " for idx, key in enumerate(keys):\n", " # get start and stop from name\n", " sname = key.rsplit(\"_\", 1)[0]\n", " rstart, rstop = key.rsplit(\":\", 2)[-1].split(\"-\")\n", "\n", " # get start relative to ref\n", " start = int(rstart) - int(region[1]) - 1\n", " stop = start + int(rstop) - int(rstart)\n", " try:\n", " sequence = list(rdict[key])\n", " arr[idx, int(start): int(stop)] = sequence\n", " except ValueError:\n", " print(key)\n", " print(sname, len(sequence), start, stop, arr.shape[1])\n", " print(rdict[key]) \n", " \n", " \n", " \n", " \n", " return arr\n", " \n", " \n", "build_ref_clusters2(self)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "\n", " # get consens seq and variant site index \n", " print(\"\")\n", " clust = []\n", " avars = refvars(arr.view(np.uint8), PSEUDO_REF)\n", " dat = b\"\".join(avars.view(\"S1\")[:, 0]).decode()\n", " snpstring = \"\".join([\"*\" if i else \" \" for i in avars[:, 1]])\n", " clust.append(\"ref_{}:{}-{}\\n{}\".format(*region, dat))\n", "\n", " # or, just build variant string (or don't...)\n", " # write all loci with [locids, nsnps, npis, nindels, ?]\n", " # for key in keys:\n", " # clust.append(\"{}\\n{}\".format(key, rdict[key]))\n", " # clust.append(\"SNPs\\n\" + snpstring)\n", " # clusts.append(\"\\n\".join(clust))\n", "\n", " outfile.write(str.encode(\"\\n//\\n//\\n\".join(clusts) + \"\\n//\\n//\\n\"))\n", " outfile.close()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 0 234 234\n", "2 0 234 234\n", "3 0 234 234\n", "4 0 234 234\n", "\n", "1 0 919 
919\n", "2 0 267 919\n", "3 0 919 919\n", "4 0 261 919\n", "\n", "1 201 462 462\n", "2 201 462 462\n", "3 201 462 462\n", "4 0 462 462\n", "\n", "1 0 237 237\n", "2 0 237 237\n", "3 0 237 237\n", "4 0 237 237\n", "\n", "1 0 238 238\n", "2 0 238 238\n", "3 0 238 238\n", "4 0 238 238\n", "\n", "1 0 590 960\n", "2 0 590 960\n", "3 0 1054 960\n", "4 0 636 960\n", "\n", "1 0 257 257\n", "\n", "2 0 248 248\n", "\n", "1 0 887 887\n", "2 78 289 887\n", "\n", "1 0 434 623\n", "2 222 623 623\n", "\n", "1 0 510 674\n", "2 271 510 674\n", "3 271 510 674\n", "4 271 674 674\n", "\n", "3 0 328 328\n", "\n", "1 244 494 649\n", "2 0 494 649\n", "3 0 649 649\n", "4 0 494 649\n", "\n", "1 0 545 545\n", "2 0 545 545\n", "3 0 545 545\n", "4 0 545 545\n", "\n", "1 254 501 1012\n", "2 254 1012 1012\n", "4 0 501 1012\n", "\n", "2 0 202 202\n", "\n", "1 0 827 827\n", "\n", "1 0 243 243\n", "2 0 243 243\n", "3 0 243 243\n", "4 0 243 243\n", "\n", "1 0 378 378\n", "2 163 378 378\n", "\n", "1 0 223 223\n", "2 0 223 223\n", "3 0 223 223\n", "4 0 223 223\n", "\n", "1 0 562 562\n", "2 0 228 562\n", "3 0 228 562\n", "4 0 228 562\n", "\n", "1 0 254 254\n", "2 0 254 254\n", "3 0 254 254\n", "4 0 254 254\n", "\n", "4 0 259 259\n", "\n", "1 0 468 468\n", "2 58 273 468\n", "3 0 273 468\n", "\n", "1 0 775 775\n", "2 343 578 775\n", "3 0 578 775\n", "4 343 578 775\n", "\n", "1 0 211 211\n", "3 0 211 211\n", "\n", "1 0 228 228\n", "2 0 228 228\n", "3 0 228 228\n", "4 0 228 228\n", "\n", "4 0 854 854\n", "\n", "1 0 238 238\n", "2 0 238 238\n", "3 0 238 238\n", "4 0 238 238\n", "\n", "1 274 765 765\n", "2 0 765 765\n", "3 0 765 765\n", "4 274 765 765\n", "\n", "1 0 219 219\n", "2 1 219 219\n", "3 1 219 219\n", "4 1 219 219\n", "\n", "1 180 422 422\n", "2 0 422 422\n", "3 180 422 422\n", "4 180 422 422\n", "\n", "1 0 231 231\n", "2 0 231 231\n", "3 0 231 231\n", "4 0 231 231\n", "\n", "1 44 288 695\n", "2 44 288 695\n", "3 0 695 695\n", "4 44 695 695\n", "\n", "1 0 251 251\n", "2 0 251 251\n", "3 0 251 
251\n", "4 0 251 251\n", "\n", "1 0 242 552\n", "2 0 242 552\n", "3 0 552 552\n", "4 0 242 552\n", "\n", "1 0 258 258\n", "2 0 258 258\n", "3 0 258 258\n", "4 0 258 258\n", "\n", "1 0 583 868\n", "2 376 868 868\n", "3 376 583 868\n", "\n", "4 0 874 874\n", "\n", "2 0 459 539\n", "3 0 459 539\n", "4 0 539 539\n", "\n", "1 118 383 383\n", "2 118 383 383\n", "4 0 383 383\n", "\n", "1 274 737 737\n", "2 0 737 737\n", "3 274 737 737\n", "4 0 737 737\n", "\n", "1 0 249 249\n", "2 0 249 249\n", "3 4 249 249\n", "4 0 249 249\n", "\n", "1 392 858 858\n", "3 0 801 858\n", "\n", "1 2 219 219\n", "2 0 219 219\n", "\n", "2 502 992 992\n", "3 0 760 992\n", "4 502 760 992\n", "\n", "2 0 207 207\n", "\n", "3 0 448 448\n", "\n", "1 0 481 481\n", "2 0 215 481\n", "3 0 215 481\n", "\n", "1 0 536 536\n", "2 276 536 536\n", "3 0 536 536\n", "4 0 536 536\n", "\n", "1 0 631 769\n", "2 0 631 769\n", "3 0 631 769\n", "4 0 769 769\n", "\n", "2 0 569 842\n", "3 358 842 842\n", "4 0 842 842\n", "\n", "1 139 620 620\n", "2 356 620 620\n", "3 0 620 620\n", "4 0 620 620\n", "\n", "1 216 471 471\n", "2 0 471 471\n", "3 0 471 471\n", "4 216 471 471\n", "\n", "3 0 215 215\n", "\n", "1 145 367 598\n", "2 145 598 598\n", "3 0 367 598\n", "4 145 598 598\n", "\n", "1 0 223 223\n", "2 0 223 223\n", "3 0 223 223\n", "4 0 223 223\n", "\n", "3 0 210 210\n", "\n", "4 0 476 476\n", "\n", "1 272 505 690\n", "2 272 690 690\n", "3 157 505 690\n", "4 0 505 690\n", "\n", "1 247 532 532\n", "2 0 460 532\n", "3 247 461 532\n", "\n", "1 0 217 389\n", "2 0 389 389\n", "3 0 389 389\n", "\n", "4 0 1118 960\n", "\n", "1 0 514 514\n", "2 268 514 514\n", "3 268 514 514\n", "4 268 514 514\n", "\n", "4 0 270 270\n", "\n", "1 142 398 748\n", "2 142 398 748\n", "3 0 748 748\n", "4 142 398 748\n", "\n", "3 0 546 546\n", "\n", "2 0 258 258\n", "4 0 258 258\n", "\n", "1 0 225 225\n", "2 0 225 225\n", "3 0 225 225\n", "4 0 225 225\n", "\n", "1 1047 1689 1929\n", "2 338 1262 1929\n", "2 1469 1929 1929\n", "3 1046 1689 1929\n", "4 
0 820 1929\n", "4 1047 1689 1929\n", "\n", "1 136 355 607\n", "2 136 355 607\n", "3 136 607 607\n", "4 0 355 607\n", "\n", "1 0 509 509\n", "2 0 415 509\n", "3 0 509 509\n", "4 0 247 509\n", "\n", "1 0 355 355\n", "\n", "1 0 490 491\n", "2 258 491 491\n", "3 0 490 491\n", "4 258 490 491\n", "\n", "4 0 835 835\n", "\n", "3 0 227 227\n", "\n", "1 0 214 214\n", "2 0 214 214\n", "3 0 214 214\n", "\n", "1 431 1061 1195\n", "2 235 1235 1195\n", "3 431 1061 1195\n", "4 0 1235 1195\n", "CATGGAGTTGATGTGCCAAGGACAGATCAGAAACCAAATAGAGGGATAAAGAGAGTCCCTTTACAACTTGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACAAGGCTCTTCACTTTAACGCTCAGGCTTTGTTTACACTGGGAGATGGTGTTCCAACACTGGCAGAAAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGGGGCTTGACTCCTGTTTGTGCCTCTGAAAATCTCTCCTTTGGTGCTCGATGCTGTGCTTCTTACCACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTTGTCAGCAGCAGAAGTGAATCTTAGCAGCAGGCATAAACATATTTGCCTCTTCCGTTCCCAAGAAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTACATATCAATGGCTTTTCTAATGGCCAGTGCTGCAAACTTCACTTGGGATCTGAAAATCAGCTGGGTTCTCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", "\n", "4 0 762 762\n", "\n", "1 0 510 510\n", "2 0 383 510\n", "3 0 383 510\n", "4 140 383 510\n", "\n", "1 450 677 677\n", "2 450 677 677\n", "3 0 677 677\n", "4 450 677 677\n", "\n", "1 185 518 518\n", "2 285 518 518\n", "3 285 518 518\n", "4 0 518 518\n", "\n", "1 0 510 644\n", "2 0 644 644\n", "3 0 510 644\n", "4 0 510 644\n", "\n", "2 0 590 590\n", "4 0 590 590\n", "\n", "1 0 222 222\n", "2 0 222 222\n", "3 0 222 222\n", "4 0 222 222\n", "\n", "1 0 507 507\n", "2 0 507 507\n", "3 0 507 507\n", "4 0 507 507\n", "\n", "1 0 551 551\n", "\n", 
"1 0 442 442\n", "4 191 442 442\n", "\n", "1 296 553 553\n", "2 296 553 553\n", "3 296 553 553\n", "4 0 553 553\n", "\n", "1 0 438 438\n", "2 0 258 438\n", "3 0 258 438\n", "4 0 438 438\n", "\n", "1 0 352 767\n", "2 109 767 767\n", "3 109 352 767\n", "4 109 352 767\n", "\n", "1 0 240 241\n", "2 0 241 241\n", "3 0 240 241\n", "4 0 240 241\n", "\n", "3 0 416 416\n", "\n", "1 0 516 516\n", "2 0 227 516\n", "3 0 227 516\n", "4 0 227 516\n", "\n", "2 0 327 327\n", "\n", "1 0 892 892\n", "2 278 892 892\n", "3 186 723 892\n", "4 278 892 892\n", "\n", "1 0 470 470\n", "2 0 470 470\n", "3 0 470 470\n", "4 0 470 470\n", "\n", "1 0 1205 1383\n", "CATGAGAGGTTATGCAGATGGAATGTGAAAATGATCAATAAAGAGAGTTGAAGTATTCCAGGCACCTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGAGTTTCCTAGCAAAGGGCTGCTCATTTCATCCTGTACCTAGGCTGTCTCACTGCCTCCTGCTAGGGATTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTAGACTTTGTCTCCAAAGCAGCTTGTTTTGTTCATTCAATGAAAAGGTGCCACGTTCTCTTCCCTAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATCTGCAAAGGGAAAAAAAAAAAGGGGGAGCTATGGGGGGTTAGCATCCTTGCGGGAGATAGATAAGTACTTAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCTGTCAGTCTAGCTCCTGCTACTCAAGGAGGTGGATTACCTACCTCAATAGGAGAACCTCCTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", "2 99 643 1383\n", "2 975 1205 1383\n", "3 424 1205 1383\n", "4 99 643 1383\n", "4 975 1383 1383\n", "\n", "1 0 255 410\n", "2 0 255 410\n", "3 0 255 410\n", "4 0 410 410\n", "\n", "1 0 249 1239\n", "1 640 1239 1239\n", "2 0 249 1239\n", "2 640 1239 1239\n", "3 0 1239 1239\n", 
"CATGTCATTTTCTTTGAGTTTTTCCCCTTTCACCTGGAAACTATCAAACCCTTGAAACTCTGGGTTGTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATTGTTCTGACCATCTTCCTGCGGCTGAGGCATCTCAAGAAACTGAGAAACGTGAGGTGCGTTAGGGACCAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGTCATTGCCCATTCTATGCGATATATCCGGGGATCGAAAAAGCACCCACAAGGTGCCATCTGGAAACCTGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGCCAAGACCCGCCAAAGGCCAGATCCTCAGCTGGTGTAAATTGGTGGTGTAGTTCCGGTGACTTCAATGGGGCTGAGCCACTGTACACCAGCTAAGGGTCCAGCACAAAGGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAGCGTCACTTGCACCGAACTGGGATTTGGGGCAAATGCAGTCTGGAGTCAGAGGGTGCAGGTTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", "4 0 1239 1239\n", "CATGTCATTTTCTTTGAGTTTTTCCCCTTTCACCTGGAAACTATCAAACCCTTGAAACTCTGGGTTGTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATTGTTCTGACCATCTTCCTGCGGCTGAGGCATCTCAAGAAACTGAGAAACGTGAGGTGCGTTAGGGACCAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATGTATTGGGGGTTGCTTGGGACTGGGTGGTAAGGTGGGTAGGGGACTGTTTGGACAGGACTCTTGANNNNNNCGTGGGTGTCCAGAAGGTAAGCGCTTGGTGAACTTGATCCACCCGTGAGTTTATAAACTGACGGTTGCACGAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGTGGTGTAGTTCCGGTGACTTCAATGGGGCTGAGCCACTGTACACCAGCTAAGGGTCCAGCACAAAGGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAGCGTCACTTGCACCGAACTGGGATTTGGGGCAAATGCAGTCTGGAGTCAGAGGGTGCAGGTTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", "\n", "1 0 254 254\n", "2 0 254 254\n", "3 0 254 
254\n", "4 0 254 254\n", "\n", "2 0 511 511\n", "4 0 511 511\n", "\n", "1 584 1015 1015\n", "2 584 1015 1015\n", "3 0 1016 1015\n", "CATGTCTGAGTCTGTATCTCCTGAAGATGGAGCTATCCAGTTCCCTGCTACATATGTGCCAGTTCAACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCCATTAACATACATAGAGAGACCTAGGCATCCCCTAGTATTATTTCAGGTAGAAGCAACATACAACTGTTGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTCCTAAGGGAGGCGAAAGGCCAAGTTAAAGGATGCCCAACTAGGAGAGGAGCAATTNNNNNNNNNNNNNNNNCATGGGAGGGGAGCGGCTGCAATGATGTCTGGAAAGAAGGGGAACAAAAGGTCTTGTGGTTTGGATGCAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATCTCCAGATCTGTGTGAACTTCAGTTTTAGGAGCTGTGAACCTCTTGAGCGATACTTGTAGGAGATTTCAAATTACCCCTCTGCTGTGTAATAGGGATGGAAGAAGCCTCCAGACCAAAGTAGTTTAGCACAAGCGGATGCCCCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNANAAAACNCTGCCCTG\n", "4 584 1015 1015\n", "\n", "1 0 234 239\n", "2 0 234 239\n", "3 0 239 239\n", "4 0 239 239\n", "\n", "1 272 541 541\n", "2 0 541 541\n", "3 316 541 541\n", "4 0 541 541\n", "\n", "1 0 217 217\n", "2 0 217 217\n", "3 0 217 217\n", "\n", "1 361 582 788\n", "2 361 582 788\n", "3 0 788 788\n", "4 361 582 788\n", "\n", "1 0 558 558\n", "\n", "1 468 722 722\n", "2 468 722 722\n", "3 0 722 722\n", "4 468 722 722\n", "\n", "2 0 214 215\n", "3 0 214 215\n", "4 0 215 215\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1 0 230 230\n", "2 0 230 230\n", "3 0 230 230\n", "4 0 230 230\n", "\n", "1 0 427 427\n", "2 0 427 427\n", "3 0 427 427\n", "4 0 427 427\n", "\n", "1 0 847 848\n", "2 134 847 848\n", "3 0 363 848\n", "3 625 848 848\n", "4 134 363 848\n", "4 625 847 848\n", "\n", "1 0 247 247\n", "2 0 247 247\n", "3 0 247 247\n", "4 0 247 247\n", "\n", "1 0 217 736\n", "1 496 736 
736\n", "2 0 217 736\n", "2 496 736 736\n", "3 0 736 736\n", "4 0 217 736\n", "4 496 736 736\n", "\n", "1 0 305 451\n", "2 0 451 451\n", "4 0 451 451\n", "\n", "1 0 244 244\n", "2 0 244 244\n", "3 0 244 244\n", "4 0 244 244\n", "\n", "1 0 265 265\n", "4 0 265 265\n", "\n", "1 116 390 390\n", "2 0 341 390\n", "3 116 341 390\n", "4 116 341 390\n", "\n", "1 0 237 237\n", "2 0 237 237\n", "3 0 237 237\n", "4 0 237 237\n", "\n", "1 231 487 487\n", "2 231 487 487\n", "3 231 487 487\n", "4 0 487 487\n", "\n", "1 0 223 223\n", "2 0 223 223\n", "3 0 223 223\n", "4 0 223 223\n", "\n", "1 0 226 226\n", "2 0 226 226\n", "4 0 226 226\n", "\n", "1 305 528 528\n", "2 305 528 528\n", "3 0 528 528\n", "4 305 528 528\n", "\n", "4 0 263 263\n", "\n", "1 0 583 583\n", "2 94 583 583\n", "3 94 307 583\n", "\n", "1 612 1142 1142\n", "2 337 1142 1142\n", "3 0 1142 1142\n", "CATGGGAAGCTTTGATCCCTTTCAGGAATCCTTAAGGGTGTGACATTTGAAGCTCTTCCAGAGACAGGATGNNNNNNNNNNNNNNNNNTGGTAAAGCCGTAAATCTTGGAAGCTCCGTGTTTGAGCCAAATCTACCCTAGGATGAGATGTAGCCTGCAGCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCCTCCGGAGGTTGTGGAATCTCCATCTCTGGAGATATTTAAGAGTAGTTTAGATAACTGTCTATCTGGGATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATCTATGAATCCTTTTATGTCTCCACTATCTGTGGTGGAGCAATTTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGAAGCTGGTTAGTTTGTTTTATTCCATTTCCTTCACTGCTGGGTTCGCCTCCTTCATCTCAGCAGCCTGGAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTTGATTTCCTTCACTTCTCAGCATTTCCCCTCTTGTTAGGACCTGATTCTCTGCAATCCTGTGCGTCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATGCAAGGCAAGGGAGAATCAGGCCAATGGAG\n", "4 612 1142 1142\n", "\n", "4 0 256 256\n", "\n", "1 276 807 807\n", "2 276 807 807\n", "3 276 807 807\n", "4 0 807 807\n", "\n", "2 0 212 212\n", "\n", "1 0 484 675\n", 
"2 0 484 675\n", "3 0 675 675\n", "4 0 484 675\n", "\n", "2 0 1255 1072\n", "AATTCTTTGAGCTGCTGTTGATAAATCAGCCTGGAAGAGGTGGAACGTTGTGAAGATTCCCGGGGTACCCAGCGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCTATGCAGGTGAGCAAGAATCAGGCGTCCTAGACTCCACGGTGCAAATAAGTGACGGCCACGTTAACAYGACCCTGTACTGAAAACCCACGAACCACTATGCCTACCTTCATGCCTCCAGCTTCCATCCCGGACACACCACACGATCCATCGTCCACAGCCAAGTGCTGAGGTACAACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGACGTGTACCTAGAAGTCTCCTGCTGCAAGACAAACCCAAGAAAGAAACCAACAGGACTCCGCTGGCCATCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCACAGGCCTTGGGTGGCAGGCCAGTCCTCGCCCCCAGACAGCCTGCCAACCTGAAGCATATTCTCACCAGTAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGCAACAAACCTTGATGCCAGCTATGCCCACATATCTACACCAGCGACACCATCACAGGATCTAACCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTCCACCAATGTAATATATGCCATCATGTGCCAGCAATGCCCCTCTGCTATGTACATCGGCCARACTGGACAGTCTCTACGGAAAAGGATAAATGGACACAAATCAGATATTAGGAATGGCAATCNNNNNNNNNNNNNNNNNNNNNNCT\n", "3 0 496 1072\n", "3 835 1072 1072\n", "4 0 1072 1072\n", 
"AATTCTTTGAGCTGCTGTTGATAAATCAGCCTGGAAGAGGTGGAACGTTGTGAAGATTCCCGGGGTACCCAGCGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAAATAAGTGACGGCCACGTTAACACGACCCTGTACTGAAAACCCACGAACCACTATGCCTACCTTCATGCCTCCAGCTTCCATCCCGGACACACCACACGATCCATCGTCCACAGCCAAGTGCTGAGGTACAACTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGACGTGTACCTAGAAGTCTCCTGCTGCAAGACAAACCCAAGAAAGAAACCAACAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACAAACCTCGATGCCAACTCTGCCCACATATCTACACCAGCGACACCATCACAGGATCTAACCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTGCCAGCAATGCCCCTCTGCTATGTACATCGGMCAAACTGGMCAGTCTCTAYGGAAAAGGATAAATGGACACAAATCAGATATTAGGAATGGCAATTNNNNNNNNNNNNNNNGAGAACACT\n", "\n", "1 0 227 227\n", "2 0 227 227\n", "3 0 227 227\n", "4 0 227 227\n", "\n", "2 0 220 220\n", "3 0 220 220\n", "4 0 220 220\n", "\n", "1 0 230 230\n", "2 0 230 230\n", "3 0 230 230\n", "4 0 230 230\n", "\n", "1 0 222 222\n", "2 0 222 222\n", "3 0 222 222\n", "4 0 222 222\n", "\n", "3 0 606 606\n", "\n", "4 0 265 265\n", "\n", "1 0 229 229\n", "2 0 229 229\n", "3 0 229 229\n", "4 0 229 229\n", "\n", "1 0 238 238\n", "2 0 238 238\n", "3 0 238 238\n", "4 0 238 238\n", "\n", "1 0 704 704\n", "2 203 704 704\n", "3 0 704 704\n", "4 0 704 704\n", "\n", "1 0 573 573\n", "2 0 573 573\n", "3 0 573 573\n", "4 0 573 573\n", "\n", "1 0 254 254\n", "3 0 254 254\n", "\n", "1 0 227 227\n", "2 0 227 227\n", "3 0 227 227\n", "4 0 227 227\n", "\n", "1 0 220 220\n", "2 0 220 220\n", "\n", "2 0 572 572\n", "\n", "2 0 413 413\n", "\n", "1 0 651 930\n", "2 0 930 930\n", "3 0 651 930\n", "4 0 651 930\n", "\n", "1 0 220 220\n", "2 0 220 220\n", "3 0 220 220\n", "4 0 220 220\n", "\n", "1 0 350 350\n", "3 0 350 350\n", "4 0 209 350\n", "\n", 
"4 0 542 542\n", "\n", "2 118 1169 1078\n", "3 0 1169 1078\n", "AAATTTGCTTCTCTGTTAACTGGGGAGAATAAACCTTCCTTTCCCCCGCNNTGTGCTCTGCCTTGTCTAGCTAGATTGTCAGCTCTTTGAGACAGGGACTGTCTCTTACAGTGCGTAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTGCCTTTAACAATCTCACCTCCTTATATGGCTAAACTATTCTTTTTTTGAACTTGGCTTTGTTTACAGTCCACTTAAAATATTTAGACTCTGGCTGGGTTAGACAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTTATTTACATCCTATAAAATAAAGTGTTATTGGCAATATAGCAGCTTCCTAGCTCCATTTTAACTATTGACTGGGGTAACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATTGAGTGGTACACAGGAATGGCGGAGCTCAGTCAGACCGCTGGGCCATCTAGCCCAAGGTTCTGCCTTTGCAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATTGAGTGCTCCATCCCATCATCCACTCCCAGCTTCTGGAAGTCAGAGGTCTAGA\n", "\n", "1 271 518 518\n", "2 271 518 518\n", "3 271 518 518\n", "4 0 518 518\n", "\n", "1 0 256 256\n", "2 0 256 256\n", "3 0 256 256\n", "4 0 256 256\n", "\n", "4 0 679 679\n", "\n", "1 0 258 258\n", "2 0 258 258\n", "4 0 258 258\n", "\n", "1 152 501 501\n", "2 152 390 501\n", "3 152 390 501\n", "4 0 501 501\n", "\n", "1 0 717 717\n", "2 0 244 717\n", "2 502 717 717\n", "3 0 717 717\n", "4 0 244 717\n", "\n", "1 405 622 809\n", "2 0 622 809\n", "4 405 809 809\n", "\n", "1 387 617 617\n", "2 387 617 617\n", "4 0 617 617\n", "\n", "1 1 233 233\n", "2 1 233 233\n", "3 0 233 233\n", "4 1 233 233\n", "\n", "1 349 605 605\n", "2 349 605 605\n", "3 349 605 605\n", "4 0 605 605\n", "\n", "1 0 498 498\n", "2 0 498 498\n", "3 0 498 498\n", "4 0 498 498\n", "\n", "1 577 1180 1180\n", "2 340 1180 1180\n", "3 571 1180 1180\n", "4 0 1180 1180\n", 
"CATGAAGGGCTGGTGCAACCATTTAGGCGACCGCCTAGGGCACAAAGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCAAGTAGGGGGGGGGAGGCGGCACGCGGGGGAGCCCCCCCCCCCAGCTCCCCCCTGCCCCGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCTCCCACCTCCCAGGCTTGTGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGGGGGGCGCAAGGTGGAAGTTTCACCTAGGGTGCGAAACATCTTTGCACCGGCCCTGCATCCATGACATGAGAAGTTGTGTCTTTTGGTCCCTGGGTGTCTGCCACATCAACACCACCACCCAATAGCTTTGGGGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCTGATTGGCAGTGCTTCCTAGTTCCCTGATTGATCCAGATGCTCTATTTAAGCCTGGAGGGGACAGAAGAAAATTTTCTGTACAAGTAGGTGGATCCTCAGCTGGCTGGTTTGACAGCACCTGCTCCTTTGCATTGCTTTGGCTCCTNNNNNNNNNNNNNNNNNNNNNCCCTTAGCTCTGGTCCCAGGCCCCTTACTCCACTTTGCCTACTAAGCTGAAGCACCCGTGCCCTAGCCATGACAGCTGGTAGCTGTGGCAAGATTGAAATGCAATAC\n", "\n", "1 11 248 248\n", "2 11 248 248\n", "3 11 248 248\n", "4 0 248 248\n", "\n", "1 135 510 510\n", "2 135 510 510\n", "4 0 510 510\n", "\n", "1 0 233 233\n", "2 0 233 233\n", "3 0 233 233\n", "4 0 233 233\n", "\n", "1 0 462 462\n", "2 0 220 462\n", "3 0 316 462\n", "\n", "1 0 363 363\n", "2 0 256 363\n", "3 0 256 363\n", "4 0 256 363\n", "\n", "1 110 352 352\n", "2 0 352 352\n", "3 110 352 352\n", "4 110 352 352\n", "\n", "1 0 218 218\n", "4 1 216 218\n", "\n", "1 0 534 908\n", "2 275 908 908\n", "3 275 534 908\n", "4 275 534 908\n", "\n", "1 0 579 676\n", "2 345 579 676\n", "3 204 579 676\n", "4 345 676 676\n", "\n", "1 0 486 486\n", "2 0 486 486\n", "3 0 486 486\n", "4 0 486 486\n", "\n", "1 0 232 368\n", "2 0 232 368\n", "3 0 368 368\n", "4 0 232 368\n", "\n", "1 0 817 817\n", "\n", "1 88 309 309\n", "2 0 309 309\n", "3 88 309 309\n", "4 81 309 309\n", "\n", "2 83 296 296\n", "3 0 296 296\n", "\n", "2 0 256 256\n", "4 0 256 256\n", "\n", "1 0 248 248\n", "2 0 248 248\n", "3 0 248 248\n", "4 0 248 248\n", "\n", 
"1 0 779 943\n", "2 0 779 943\n", "3 0 943 943\n", "4 0 943 943\n", "\n", "1 0 232 660\n", "2 0 232 660\n", "3 0 660 660\n", "\n", "4 0 249 249\n", "\n", "1 195 444 444\n", "2 195 444 444\n", "3 0 444 444\n", "4 0 444 444\n", "\n", "1 0 821 960\n", "2 0 1059 960\n", "3 0 1148 960\n", "4 0 1059 960\n", "\n", "2 184 711 711\n", "4 0 453 711\n", "\n", "1 0 755 755\n", "\n", "1 0 249 490\n", "2 0 249 490\n", "3 0 490 490\n", "4 0 249 490\n", "\n", "1 0 612 612\n", "3 0 612 612\n", "4 0 612 612\n", "\n", "4 0 259 259\n", "\n", "1 0 419 419\n", "2 0 246 419\n", "3 0 246 419\n", "4 0 419 419\n", "\n", "1 0 232 515\n", "2 0 470 515\n", "3 0 232 515\n", "4 0 515 515\n", "\n", "1 0 725 725\n", "2 220 438 725\n", "3 0 438 725\n", "4 104 438 725\n", "\n", "1 0 248 248\n", "2 0 248 248\n", "3 0 248 248\n", "4 0 248 248\n", "\n", "2 0 465 465\n", "4 0 465 465\n", "\n", "3 0 429 429\n", "\n", "1 0 844 844\n", "2 0 224 844\n", "2 611 844 844\n", "3 0 224 844\n", "3 611 844 844\n", "4 0 224 844\n", "4 449 844 844\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1 201 441 441\n", "2 201 441 441\n", "3 201 441 441\n", "4 0 441 441\n", "\n", "1 121 353 353\n", "2 121 353 353\n", "3 121 353 353\n", "4 0 353 353\n", "\n", "1 0 253 540\n", "2 0 253 540\n", "3 0 540 540\n", "4 0 253 540\n", "\n", "1 644 1720 1604\n", "2 644 1720 1604\n", "3 133 1162 1604\n", 
"CATGATTGCATCAAAAACCTACCACAAGAGTTCAAAGTTAGGGGAGANNNNNNNNNATACTCCTTCTCCAGGGAGAGCCAGTAAATGGCAGTTTAACTGCCTTATTAACAGCAACCCTTGTATCTGCCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTAGTTACCTTGAAGATAGGGTCTAGGATTAGCTGGCAGAAAGTCCTGGGCAGTTTCCTTCCATCAGNNNNNNNNNNNNNNNNNNNNNNNNTGCCAGAAGCAGGATCAAAATACCTAAAAAAAAAATCCAGTCAGAGGGGATTGTGGTTTCGCCATCTTGTTTAATTAGCACTGCTTATTCTGCTTGCCTTTCAAGATGGAGGGAAGGGTTATCAAGTCAAAGACCCGTTTTGAGATGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTATCGTGTATTCCTGATCCCATCCCCTAATATTTGTCTGATTAAATCTAGATTTAAAGTCCCTCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAGGATTCCTTCCTTTGGGTGCTACCATACTGTTGCTTCCTTACTCCCAGCCCAGGAGAGCTCAAGTATTCCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAA\n", "4 0 1720 1604\n", "CATGTCTCTTCCCAAGGCAATGCCTATCAGTTCTTACCTTCAGTAGGGGTTTGCCCTCCTTGTCTTTATCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATTGCATCAAAAACCTACCACAAGAGTTCAAAGTTAGGGGAGAAAGCAGCTCATACTCCTTCTCCAGGGAGAGCCAGTAAATGGCAGTTTAACTGCCTTATTAACAGCAACCCTTGTATCTGCCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTAGTTACCTTGAAGATAGGGTCTAGGATTAGCTGGCAGAAAGTCCTGGGCAGTTTCCTTCCATCAGNNNNNNNNNNNNNNNNNNNNNNNTTTCCAGAAGCAGGATCAAAATACCTNAAAAAAAAATCCAGTCAGAGGGGATTGTGGTTTCGCCATCTTGTTTAATTAGCACTGCTTATTCTGCTTGCCTTTCAAGATGGAGGGAAGGGTTATCAAGTCAAAGACCCGTTTTGAGATGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTATCGTGTATTCCTGATCCCATCCCCTAATATTTGTCTGATTAAATCTAGATTTAAAGTCCCTCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAGGATTCCTTCCTTTGGGTGCTACCATACTGTTGCTTCCTTACTCCCAGCCCA\n", "\n", "1 105 331 331\n", "2 105 329 331\n", "3 0 
329 331\n", "4 105 329 331\n", "\n", "1 0 252 593\n", "2 0 252 593\n", "3 0 593 593\n", "4 0 252 593\n", "\n", "3 0 473 473\n", "\n", "2 0 231 231\n", "3 0 231 231\n", "\n", "1 0 211 211\n", "2 0 211 211\n", "\n", "2 0 831 831\n", "4 568 831 831\n", "\n", "2 201 427 427\n", "3 201 427 427\n", "4 0 427 427\n", "\n", "1 1 247 247\n", "2 0 247 247\n", "3 1 247 247\n", "4 1 247 247\n", "\n", "1 672 889 889\n", "2 315 889 889\n", "4 0 447 889\n", "\n", "1 830 1085 1085\n", "4 0 1085 1085\n", "AATTTTATACCCGGGCCACAAAGAGGCGTCATTTTGCGGGGTCAGCTACAGCCCAGCAATATCATCTGCAGTGATACTTCCAGCTGTTCTCCTGCCTTGCTAGGCTGCAAGACTTCTCTGTGATTCATGNNNNNNNNNNNNNNNNNNNNNNNNNNAATTTTAACCAACCAACCAGCCCATCCTCCTCCTCCTCCATCTTTGGGCAGGCAGCGTCTTTGACTTGCATATCGAGAGCAGCTTACCAGTTGTGACCATCTCAGTTCGGGGACCGGCTGTCTCACTTCTGCAGTGCATGGGAAGCAATAACTACGGACAGATGGGTCCTAACCACTGTGGGGTCAGGTTATGTCATCCAACTTCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTATCAGGGAAAGGGATTCTGCTCCTGGCACTTCCTGATCCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTGGTGGTTTTTCCAGACCACAGCAAGTTTCCTAATCATCAGTCAACATTATCAATACACTGGTCTCCNNNNNNNTCTGTCATCAGCCCCACGTGTCTTTAGTGAATGTCTGATGATGGTTGTAGTGACTTATCCCCAATGGAGGGAAATTCATGTCCTCCCATACCTAGACAACTGGCTAGTGAGGGGCAGATCCAAAAAATAAGTCCTCAGTCCTCTCCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", "\n", "1 0 411 411\n", "2 191 411 411\n", "3 0 411 411\n", "4 191 411 411\n", "\n", "1 0 503 503\n", "2 0 503 503\n", "3 0 503 503\n", "4 0 503 503\n", "\n", "1 114 352 427\n", "2 114 352 427\n", "3 0 427 427\n", "4 114 427 427\n", "\n", "1 0 396 541\n", "2 0 541 541\n", "3 146 477 541\n", "4 146 396 541\n", "\n", "1 0 730 730\n", "2 84 545 730\n", "3 0 631 730\n", "4 0 545 730\n", "\n", "1 0 256 257\n", "2 0 256 257\n", "3 0 257 257\n", "4 0 256 257\n", "\n", "1 0 325 325\n", "2 72 325 325\n", "3 72 325 325\n", "4 2 325 325\n", "\n", 
"2 0 257 257\n", "4 0 257 257\n", "\n", "1 0 566 566\n", "2 0 566 566\n", "3 0 566 566\n", "4 0 566 566\n", "\n", "1 0 427 661\n", "2 195 661 661\n", "3 195 661 661\n", "4 195 427 661\n", "\n", "1 0 216 414\n", "2 0 414 414\n", "3 0 414 414\n", "\n", "1 0 829 991\n", "2 409 829 991\n", "3 594 829 991\n", "4 594 991 991\n", "\n", "1 0 263 960\n", "1 612 828 960\n", "2 0 989 960\n", "3 0 263 960\n", "3 612 828 960\n", "4 0 263 960\n", "\n", "1 177 417 624\n", "2 0 417 624\n", "3 0 624 624\n", "4 177 417 624\n", "\n", "1 305 1351 1351\n", "CATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCGCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAAGTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGGCAGGGGGGGCTGCAAGGTTATTGTAGTGAGGGCACAGTATTGTCACCCTTACTTTTGTGCTGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", "2 305 562 1351\n", "2 1099 1351 1351\n", "3 0 1351 1351\n", 
"AATTTTAGCATTCACATTGTTTTTTCCCTTTTACTGCTGTGTCAAAGATGAAAACAGCTGAAGGATGACTTAAAGATCCAGCTGAATGGTATCATTTACCGCTGTACTCTGGGACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCGCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAANTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCACCAGCAAAGGGATTCATTACTTGTCCTTCAGTTTCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", "4 305 1351 1351\n", "CATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCRCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAARTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCACCAGCAAAGGGATTCATTACTTGTCCTTCAGTTTCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGGCAGGGGGGGCTGCAAGGTTATTGTAGTGAGGGCACAGTATTGTCACCCTTACTTTTGTGCTGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", "\n", "1 0 464 464\n", "2 0 464 464\n", "3 0 
464 464\n", "4 0 464 464\n", "\n", "1 236 570 960\n", "2 0 570 960\n", "3 339 570 960\n", "4 0 1241 960\n", "\n", "1 447 873 873\n", "2 247 873 873\n", "3 0 873 873\n", "4 651 873 873\n", "\n", "1 0 412 412\n", "2 0 235 412\n", "3 0 235 412\n", "4 0 235 412\n", "\n", "1 0 810 810\n", "\n", "1 0 1239 960\n", "4 0 589 960\n", "\n", "2 0 247 839\n", "3 0 839 839\n", "4 0 247 839\n", "\n", "1 359 577 577\n", "2 0 577 577\n", "4 14 577 577\n", "\n", "1 0 230 841\n", "1 620 841 841\n", "2 0 417 841\n", "2 620 841 841\n", "3 0 841 841\n", "4 0 841 841\n", "\n", "1 0 237 237\n", "2 0 237 237\n", "3 0 237 237\n", "4 0 237 237\n", "\n", "1 0 498 498\n", "2 0 234 498\n", "3 0 234 498\n", "4 0 234 498\n", "\n", "2 0 300 300\n", "\n", "1 0 240 385\n", "2 0 240 385\n", "3 0 385 385\n", "4 0 240 385\n", "\n", "1 0 523 561\n", "2 0 249 561\n", "3 0 561 561\n", "4 0 249 561\n", "\n", "1 191 420 420\n", "2 183 420 420\n", "4 0 420 420\n", "\n", "2 0 259 259\n", "3 0 259 259\n", "4 0 259 259\n", "\n", "1 0 253 253\n", "2 0 253 253\n", "3 0 253 253\n", "4 0 253 253\n", "\n", "1 0 246 246\n", "2 0 246 246\n", "3 0 246 246\n", "4 0 246 246\n", "\n", "1 224 568 568\n", "2 342 568 568\n", "3 0 568 568\n", "4 224 568 568\n", "\n", "1 0 226 227\n", "2 0 226 227\n", "3 0 226 227\n", "4 0 227 227\n", "\n", "1 49 286 286\n", "2 49 286 286\n", "3 0 286 286\n", "4 49 286 286\n", "\n", "1 0 228 228\n", "2 0 228 228\n", "3 0 228 228\n", "4 0 228 228\n", "\n", "1 0 553 554\n", "2 0 554 554\n", "3 0 241 554\n", "4 0 554 554\n", "\n", "1 187 715 715\n", "2 0 715 715\n", "3 187 715 715\n", "4 187 715 715\n", "\n", "1 0 236 236\n", "2 0 236 236\n", "3 0 236 236\n", "4 0 236 236\n", "\n", "1 0 473 473\n", "\n", "1 299 766 766\n", "2 0 766 766\n", "3 0 766 766\n", "4 0 766 766\n", "\n", "1 0 247 247\n", "3 0 247 247\n", "4 123 247 247\n", "\n" ] }, { "ename": "ValueError", "evalue": "cannot assign slice from input of different size", "output_type": "error", "traceback": [ 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mremote_build_ref_clusters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36mremote_build_ref_clusters\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0mclust\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mavars\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrefvars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muint8\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPSEUDO_REF\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0mdat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mb\"\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mavars\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"S1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0msnpstring\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"*\"\u001b[0m 
\u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m\" \"\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mavars\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: cannot assign slice from input of different size" ] } ], "source": [ "remote_build_ref_clusters(self)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/home/deren/local/src/ipyrad/bin/bedtools-linux-x86_64 bamtobed -i /home/deren/local/src/ipyrad/tests/6-tortas/test_across/test.cat.sorted.bam'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"use bedtools to pull in clusters and match catgs with alignments\"\n", "cmd1 = [\n", " ipyrad.bins.bedtools,\n", " \"bamtobed\",\n", " \"-i\", \n", " os.path.join(\n", " data.dirs.across,\n", " \"{}.cat.sorted.bam\".format(self.data.name)\n", " )\n", "]\n", "\" \".join(cmd1)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/home/deren/local/src/ipyrad/bin/bedtools-linux-x86_64 merge -d 0 -i -'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cmd2 = [\n", " ipyrad.bins.bedtools, \n", " \"merge\", \n", " \"-d\", \"0\",\n", " \"-i\", \"-\",\n", "]\n", "\n", "\" \".join(cmd2)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "cannot copy sequence with size 288 to array axis with dimension 371", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote_build_ref_clusters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/local/src/ipyrad/ipyrad/assemble/cluster_across.py\u001b[0m in \u001b[0;36mremote_build_ref_clusters\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 595\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 596\u001b[0;31m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 597\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;31m# get consens seq and variant site index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: cannot copy sequence with size 288 to array axis with dimension 371" ] } ], "source": [ "self.remote_build_ref_clusters()" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:00 | concatenating bams | s6 |\n", "[####################] 100% 0:00:00 | building clusters | s6 |\n", "\n", " Encountered an unexpected error (see ./ipyrad_log.txt)\n", " Error message is below -------------------------------\n", "cannot copy sequence with size 288 to array axis with dimension 371\n" ] } ], "source": [ "test.run(\"6\", ipyclient=ipyclient)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "test2 = test.branch(\"test2\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.consens_se import *" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "self = Step5(test2, True, ipyclient)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:16 | calculating depths | s5 |\n", "[####################] 100% 0:00:37 | chunking clusters | s5 |\n" ] } ], "source": [ "self.remote_calculate_depths()\n", "self.remote_make_chunks()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:14:53 | consens calling | s5 |\n" ] } ], "source": [ "statsdicts = self.remote_process_chunks()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "sample = self.samples[0]\n", "data = self.data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# write sequences to SAM file\n", "combs1 = glob.glob(os.path.join(\n", " data.tmpdir,\n", " \"{}_tmpcons.*\".format(sample.name)))\n", 
"combs1.sort(key=lambda x: int(x.split(\".\")[-1]))\n", "\n", "# open sam handle for writing to bam\n", "samfile = sample.files.consens.replace(\".bam\", \".sam\") \n", "with open(samfile, 'w') as outf:\n", "\n", " # parse fai file for writing headers\n", " fai = \"{}.fai\".format(data.paramsdict[\"reference_sequence\"])\n", " fad = pd.read_csv(fai, sep=\"\\t\", names=[\"SN\", \"LN\", \"POS\", \"N1\", \"N2\"])\n", " headers = [\"@HD\\tVN:1.0\\tSO:coordinate\"]\n", " headers += [\n", " \"@SQ\\tSN:{}\\tLN:{}\".format(i, j)\n", " for (i, j) in zip(fad[\"SN\"], fad[\"LN\"])\n", " ]\n", " outf.write(\"\\n\".join(headers) + \"\\n\")\n", "\n", " # write to file with sample names imputed to line up with catg array\n", " counter = 0\n", " for fname in combs1:\n", " with open(fname) as infile:\n", " # impute catg ordered seqnames \n", " data = infile.readlines()\n", " fdata = []\n", " for line in data:\n", " name, chrom, rest = line.rsplit(\":\", 2)\n", " fdata.append(\n", " \"{}_{}:{}:{}\".format(name, counter, chrom, rest)\n", " )\n", " counter += 1\n", " outf.write(\"\".join(fdata) + \"\\n\")\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "ename": "IPyradError", "evalue": "error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\\n[W::sam_read1] parse error at line 10690\\n[main_samview] truncated file.\\n'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mcomm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIPyradError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"error in samtools: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcomm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m: error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\\n[W::sam_read1] parse error at line 10690\\n[main_samview] truncated file.\\n'" ] } ], "source": [ "cmd = [ip.bins.samtools, \"view\", \"-Sb\", samfile]\n", "with open(sample.files.consens, 'w') as outbam:\n", " proc = sps.Popen(cmd, stderr=sps.PIPE, stdout=outbam)\n", " comm = proc.communicate()[1]\n", "if proc.returncode:\n", " raise IPyradError(\"error in samtools: {}\".format(comm))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-1572" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "368852-370424 " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:01:20 | indexing alleles | s5 |\n" ] }, { "ename": "IPyradError", "evalue": "IPyradError(error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\\n[W::sam_read1] parse error at line 10690\\n[main_samview] truncated file.\\n')", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote_concatenate_chunks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_store\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstatsdicts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/local/src/ipyrad/ipyrad/assemble/consens_se.py\u001b[0m in \u001b[0;36mremote_concatenate_chunks\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msuccessful\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0mip\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"error in step 5: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 317\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIPyradError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIPyradError\u001b[0m: IPyradError(error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\\n[W::sam_read1] parse error at line 10690\\n[main_samview] truncated file.\\n')" ] } ], "source": [ "self.remote_concatenate_chunks()\n", "self.data_store(statsdicts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "shutil.rmtree(self.data.tmpdir)\n", 
"self.data.save()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.cluster_across import *\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "self = Step6(test, True, ipyclient)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:56 | concatenating bams | s6 |\n" ] } ], "source": [ "self.remote_concat_bams()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:13 | building clusters | s6 |\n" ] } ], "source": [ "self.remote_build_ref_regions()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Contig0', 3480, 3747),\n", " ('Contig0', 24390, 24761),\n", " ('Contig0', 24903, 25488),\n", " ('Contig0', 30442, 30712),\n", " ('Contig0', 33784, 34013),\n", " ('Contig0', 37975, 38628),\n", " ('Contig0', 40387, 40730),\n", " ('Contig0', 41782, 42162),\n", " ('Contig0', 44567, 44783),\n", " ('Contig0', 47550, 47763)]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "self.regions[:10]" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AGO09concat_0:1:3481-3748\n", "3481 3748 0 267 267 267\n", "\n", "PZ03concat_0:1:3481-3748\n", "3481 3748 0 267 267 267\n", "\n", "LP08concat_0:1:24391-24679\n", "24391 24679 0 288 288 371\n", "\n", "PBR_B125concat_0:1:24391-24762\n", "24391 24762 0 371 371 371\n", "\n", "AGO09concat_1:1:24904-25489\n", "24904 25489 0 585 585 585\n", "\n", "PBR_B125concat_1:1:25020-25249\n", "25020 25249 116 345 229 585\n", "\n", 
"PZ03concat_1:1:30443-30713\n", "30443 30713 0 270 270 270\n", "\n", "AGO09concat_2:1:33785-34014\n", "33785 34014 0 229 229 229\n", "\n", "LP08concat_1:1:33785-34014\n", "33785 34014 0 229 229 229\n", "\n", "PBR_B125concat_2:1:33785-34014\n", "33785 34014 0 229 229 229\n", "\n", "PZ03concat_2:1:33785-34014\n", "33785 34014 0 229 229 229\n", "\n", "PZ03concat_3:1:37976-38629\n", "37976 38629 0 653 653 653\n", "\n", "AGO09concat_3:1:40396-40633\n", "40396 40633 8 245 237 343\n", "\n", "LP08concat_2:1:40388-40633\n", "40388 40633 0 245 245 343\n", "\n", "PBR_B125concat_3:1:40396-40633\n", "40396 40633 8 245 237 343\n", "\n", "PZ03concat_4:1:40388-40731\n", "40388 40731 0 343 343 343\n", "\n", "AGO09concat_4:1:41783-42163\n", "41783 42163 0 380 380 380\n", "\n", "LP08concat_3:1:41920-42163\n", "41920 42163 137 380 243 380\n", "\n", "PBR_B125concat_4:1:41920-42163\n", "41920 42163 137 380 243 380\n", "\n", "PZ03concat_5:1:41920-42163\n", "41920 42163 137 380 243 380\n", "\n", "AGO09concat_5:1:44568-44784\n", "44568 44784 0 216 216 216\n", "\n", "LP08concat_4:1:44568-44784\n", "44568 44784 0 216 216 216\n", "\n", "PZ03concat_6:1:44568-44784\n", "44568 44784 0 216 216 216\n", "\n", "PBR_B125concat_5:1:47551-47764\n", "47551 47764 0 213 213 213\n", "\n", "AGO09concat_6:1:47992-48229\n", "47992 48229 0 237 237 237\n", "\n", "PBR_B125concat_6:1:49479-50005\n", "49479 50005 0 526 526 526\n", "\n", "AGO09concat_7:1:56089-56333\n", "56089 56333 111 355 244 355\n", "\n", "LP08concat_5:1:56089-56333\n", "56089 56333 111 355 244 355\n", "\n", "PBR_B125concat_7:1:56089-56333\n", "56089 56333 111 355 244 355\n", "\n", "PZ03concat_7:1:55978-56333\n", "55978 56333 0 355 355 355\n", "\n", "AGO09concat_8:1:62284-62501\n", "62284 62501 28 245 217 457\n", "\n", "LP08concat_6:1:62284-62501\n", "62284 62501 28 245 217 457\n", "\n", "PBR_B125concat_8:1:62284-62501\n", "62284 62501 28 245 217 457\n", "\n", "PZ03concat_8:1:62256-62713\n", "62256 62713 0 457 457 457\n", "\n", 
"PZ03concat_9:1:64734-65000\n", "64734 65000 0 266 266 266\n", "\n", "AGO09concat_9:1:68597-68887\n", "68597 68887 829 1119 290 1301\n", "\n", "LP08concat_7:1:68597-69007\n", "68597 69007 829 1239 410 1301\n", "\n", "PBR_B125concat_9:1:68597-69069\n", "68597 69069 829 1301 472 1301\n", "\n", "PZ03concat_10:1:67768-68840\n", "67768 68840 0 1072 960 1301\n" ] }, { "ename": "ValueError", "evalue": "cannot copy sequence with size 960 to array axis with dimension 1072", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrstop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrefs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 52\u001b[0;31m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0msidx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstop\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 53\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: cannot copy sequence with size 960 to array axis with dimension 1072" ] } ], "source": [ "# access reads from bam file using pysam\n", "bamfile = AlignmentFile(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"cat.sorted.bam\"),\n", " 'rb')\n", "\n", "# catcons output file for raw clusters and db samples\n", "outfile = gzip.open(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.catcons.gz\".format(self.data.name)),\n", " 'wb')\n", "\n", "# write header line to catcons with sample names\n", "snames = ['ref'] + sorted([i.name for i in self.samples])\n", "outfile.write(\n", " b\" \".join([i.encode() for i in snames]) + b\"\\n\")\n", "\n", "for region in self.regions[:100]:\n", " reads = bamfile.fetch(*region)\n", "\n", " # get reference sequence\n", " refn, refs = get_ref_region(\n", " self.data.paramsdict[\"reference_sequence\"],\n", " region[0], region[1] + 1, region[2] + 1\n", " )\n", "\n", " # build cluster dict for sorting \n", " rdict = {}\n", " for read in reads:\n", " rdict[read.qname] = read.seq \n", " keys = sorted(rdict.keys(), key=lambda x: x.rsplit(\":\", 2)[0])\n", "\n", " # make an empty array\n", " arr = np.zeros((len(snames), len(refs)), dtype=bytes)\n", "\n", " # fill the array\n", " arr[0] = list(refs)\n", " for idx, key in enumerate(keys):\n", " # get start and stop from this seq\n", " sname = key.rsplit(\"_\", 1)[0]\n", " rstart, rstop = key.rsplit(\":\", 2)[-1].split(\"-\")\n", " sidx = snames.index(sname)\n", "\n", " # get start and stop relative to reference region\n", " start = int(rstart) - int(region[1]) - 1\n", " stop = start + int(rstop) - int(rstart)\n", " \n", " print(key)\n", " print(rstart, rstop, start, stop, len(rdict[key]), len(refs))\n", " arr[sidx, start: stop] = list(rdict[key])\n", " 
print(\"\")\n", "\n", " arr[arr == b\"\"] = b\"N\"\n", " clusts.append(b\"\\n\".join([b\"\".join(line) for line in arr]))" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "# avars = refvars(arr.view(np.uint8), PSEUDO_REF)\n", "# dat = b\"\".join(avars.view(\"S1\")[:, 0]).decode()\n", "# snpstring = \"\".join([\"*\" if i else \" \" for i in avars[:, 1]])\n", "# cname = \"ref_{}:{}-{}\\n{}\".format(*region, dat)\n", "# cname" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-1072" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "67768 -68840" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "b'AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGCATCTCTTGTCACTGTCCAGCAATAGTCTGCAAGCATTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCCGTGCTCGTCGCTCACTGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATGTTGCAACCAAGGCTTTTGTATGCCTTGAGGAGGTTTTCCACCAACAACCTGTAGTTGTCTGCCTTGTTGTTTCCGAGAAAATT\\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTSAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTTTATTCATTAAATGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGATAGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGGAATCGCTCGCCATGNNNNNCGCTCACTGCTCCGCAGTTCAGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\\nAATTTATTATGAAACAAAAGGCNAAAAACNATTATGTACATANTTTAGTCCTATTCAGTGTCTACTCAG
CNCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGGATCTCTTGTCNNNNNNNNNNNNNNNNNNNNNNNNATTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCCNNNNNNNNCGCTCACTGCTCNGCAGTTCNGTGGAAAAAAATCTNNATGAGAGTGCAAAAAATGTATCTTTAGTGACATGTTGCAACCAAGGCTTTTGTATGCCTTGAGAAGGTTTTCCACCAACAACCTGTAGTTGTCTGCCTTGTTGTTTCCGAGAAAATT\\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clusts.append(b\"\\n\".join([b\"\".join(line) for line in arr]))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "outfile.close()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: test\n", "[####################] 100% 0:00:56 | concatenating bams | s6 |\n", "\n", " Encountered an unexpected error (see ./ipyrad_log.txt)\n", " Error message is below -------------------------------\n", "cannot copy sequence with size 288 to array axis with dimension 371\n" ] } ], "source": [ "test.run('6')" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'AGO02concat'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## do cigar building again, this time with orientations\n", "data = self.data\n", "sample = list(data.samples.values())[0]\n", "sample.name" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# get all regions with reads. 
Generator to yield (str, int, int)\n", "fullregions = bedtools_merge(data, sample).strip().split(\"\\n\")\n", "regions = (i.split(\"\\t\") for i in fullregions)\n", "regions = [(i, int(j), int(k)) for (i, j, k) in regions]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "regs = regions[:20]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\tFalse\tCATGAG\tCTACAG\tContig0\t3481\t3748\n", "True\tFalse\tCATGAG\tCTACAG\tContig0\t3481\t3748\n", "False\tTrue\tTTCCAG\tAGAATT\tContig0\t3481\t3748\n", "False\tTrue\tTTCCAG\tAGAATT\tContig0\t3481\t3748\n", "True\tFalse\tCATGGT\tGGACAC\tContig0\t4940\t5088\n", "False\tTrue\tGAGGGA\tGCAATT\tContig0\t4940\t5088\n", "False\tFalse\tAATTGG\tGGAGTT\tContig0\t5481\t5571\n", "True\tTrue\tCATTTT\tGGCATG\tContig0\t5481\t5571\n", "False\tFalse\tAATTGC\tGGGATT\tContig0\t6267\t6397\n", "True\tTrue\tATCCGG\tCCCATG\tContig0\t6267\t6397\n", "True\tFalse\tCATGAT\tTGTGAA\tContig0\t14907\t15026\n", "False\tTrue\tCGCCCC\tAGAATT\tContig0\t14907\t15026\n", "False\tFalse\tAATTCT\tCTACAG\tContig0\t16737\t16933\n", "True\tTrue\tGCAGGA\tGGCATG\tContig0\t16737\t16933\n", "True\tFalse\tCATGAA\tCAGGCA\tContig0\t19897\t20031\n", "False\tTrue\tCTGAGT\tTAAATT\tContig0\t19897\t20031\n", "True\tFalse\tCATGGA\tTGTTGC\tContig0\t23715\t23823\n", "False\tTrue\tCCCTTG\tCAAATT\tContig0\t23715\t23823\n", "False\tFalse\tAATTTA\tACTTCT\tContig0\t24391\t24679\n", "False\tFalse\tAATTAT\tAATGGA\tContig0\t24391\t24679\n", "False\tFalse\tAATTTA\tCTTGTC\tContig0\t24391\t24679\n", "True\tTrue\tATTGAT\tGCCATG\tContig0\t24391\t24679\n", "True\tTrue\tCGCTCA\tGACATG\tContig0\t24391\t24679\n", "True\tTrue\tCGCTCA\tGACATG\tContig0\t24391\t24679\n", "False\tFalse\tAATTGA\tAGTCTC\tContig0\t25290\t25447\n", "False\tFalse\tAATTGA\tAGTCTC\tContig0\t25290\t25447\n", "True\tTrue\tGATCGT\tTCCATG\tContig0\t25290\t25447\n", 
"True\tTrue\tGATTGT\tTCCATG\tContig0\t25290\t25447\n", "False\tFalse\tAATTTA\tTCTTAA\tContig0\t26192\t26308\n", "True\tTrue\tTATGTT\tCACATG\tContig0\t26192\t26308\n", "True\tFalse\tCATGTT\tCATAGC\tContig0\t28528\t28669\n", "False\tTrue\tCATAGC\tAAAATT\tContig0\t28528\t28669\n", "True\tFalse\tCATGCC\tGTTTTA\tContig0\t29084\t29217\n", "False\tTrue\tCGATCT\tGCAATT\tContig0\t29084\t29217\n", "True\tFalse\tCATGAG\tAAAAAT\tContig0\t29454\t29541\n", "False\tTrue\tGGGGTT\tTCAATT\tContig0\t29454\t29541\n", "False\tFalse\tAATTCC\tCTTCCT\tContig0\t30443\t30713\n", "True\tTrue\tTGGTAG\tCACATG\tContig0\t30443\t30713\n", "False\tFalse\tAATTAG\tATTAAT\tContig0\t33785\t34014\n", "False\tFalse\tAATTAG\tATTAAT\tContig0\t33785\t34014\n", "True\tTrue\tATAGGC\tTGCATG\tContig0\t33785\t34014\n", "True\tTrue\tATAGGC\tTGCATG\tContig0\t33785\t34014\n", "False\tFalse\tAATTTG\tTTAGCC\tContig0\t34452\t34659\n", "True\tTrue\tGGCAGC\tCTCATG\tContig0\t34452\t34659\n", "False\tFalse\tAATTTC\tCGTAGA\tContig0\t34452\t34659\n", "True\tTrue\tTCTAAT\tAACATG\tContig0\t34452\t34659\n", "False\tFalse\tAATTAA\tTTTCTT\tContig0\t37976\t38247\n", "False\tFalse\tAATTAA\tTTTCTT\tContig0\t37976\t38247\n", "True\tTrue\tGAGGCT\tCCCATG\tContig0\t37976\t38247\n", "True\tTrue\tGAGGCT\tCCCATG\tContig0\t37976\t38247\n", "True\tFalse\tCATGGA\tCATTGC\tContig0\t38736\t38961\n", "False\tTrue\tGGTCGG\tGCAATT\tContig0\t38736\t38961\n", "True\tFalse\tCATGCA\tGTGGTA\tContig0\t40396\t40633\n", "True\tFalse\tCATGCA\tGTGGTA\tContig0\t40396\t40633\n", "True\tFalse\tCATACA\tGTGGCA\tContig0\t40396\t40633\n", "False\tTrue\tGCTTGT\tCAAATT\tContig0\t40396\t40633\n", "False\tTrue\tGCTTGT\tCAAATT\tContig0\t40396\t40633\n", "False\tTrue\tGCTTGT\tCAAATT\tContig0\t40396\t40633\n" ] } ], "source": [ "# access reads from bam file using pysam\n", "bamfile = AlignmentFile(\n", " os.path.join(\n", " data.dirs.refmapping,\n", " \"{}-mapped-sorted.bam\".format(sample.name)),\n", " 'rb')\n", "\n", "# iterate over all regions\n", "opath = 
os.path.join(\n", " data.dirs.clusts, \"{}.alt.clustS.gz\".format(sample.name))\n", "out = gzip.open(opath, 'wt')\n", "idx = 0\n", "clusters = []\n", "for reg in regs:\n", " # uncomment and compare against ref sequence when testing\n", " ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n", " reads = bamfile.fetch(*reg)\n", "\n", " # store reads in a dict\n", " rdict = {}\n", "\n", " # paired-end data cluster building\n", " if \"pair\" in data.paramsdict[\"datatype\"]:\n", "\n", " # match paired reads together in a dictionary\n", " for read in reads:\n", " print(\"\\t\".join(\n", " str(i) for i in \n", " [read.is_read1, read.is_reverse,\n", " read.seq[:6], read.seq[-6:], \n", " *reg\n", " ]))\n", " if read.qname not in rdict:\n", " rdict[read.qname] = [read, None]\n", " else:\n", " rdict[read.qname][1] = read\n", "\n", " # sort keys by derep number\n", " keys = sorted(\n", " rdict.keys(),\n", " key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n", "\n", " # build the cluster based on map positions, orientation, cigar\n", " clust = []\n", " clust.append(\"{}\\n{}\".format('ref', ref[1]))\n", " for key in keys:\n", " r1, r2 = rdict[key]\n", " if r1 and r2:\n", "\n", " #lref = len(ref[1])\n", " aref = np.array(list(ref[1]))\n", " lref = reg[2] - reg[1]\n", " arr1 = np.zeros(lref, dtype=\"U1\")\n", " arr2 = np.zeros(lref, dtype=\"U1\")\n", " arr1.fill(\"-\")\n", " arr2.fill(\"-\")\n", "\n", " # how far ahead of the start does this read begin\n", " seq = cigared(r1.seq, r1.cigar)\n", " start = r1.reference_start - reg[1] \n", " arr1[start:start + len(seq)] = list(seq)\n", "\n", " seq = cigared(r2.seq, r2.cigar)\n", " start = r2.reference_start - reg[1] \n", " arr2[start:start + len(seq)] = list(seq)\n", "\n", " arr3 = join_arrays(arr1, arr2)\n", " pairseq = \"\".join(arr3)\n", "\n", " ori = \"+\"\n", " if r1.is_reverse:\n", " ori = \"-\"\n", " derep = r1.qname.split(\"=\")[-1]\n", " rname = \"{}:{}-{};size={};{}\".format(*reg, derep, ori)\n", " 
clust.append(\"{}\\n{}\".format(rname, pairseq))\n", "\n", " # single-end data cluster building\n", " else: \n", " for read in reads:\n", " rdict[read.qname] = read\n", "\n", " # sort keys by derep number\n", " keys = sorted(\n", " rdict.keys(),\n", " key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n", "\n", " # build the cluster based on map positions, orientation, cigar\n", " clust = []\n", " for key in keys:\n", " r1 = rdict[key]\n", "\n", " #aref = np.array(list(ref[1]))\n", " lref = reg[2] - reg[1]\n", " arr1 = np.zeros(lref, dtype=\"U1\")\n", " arr1.fill(\"-\")\n", "\n", " # how far ahead of the start does this read begin\n", " seq = cigared(r1.seq, r1.cigar)\n", " start = r1.reference_start - reg[1] \n", " arr1[start:start + len(seq)] = list(seq)\n", " aseq = \"\".join(arr1)\n", "\n", " ori = \"+\"\n", " if r1.is_reverse:\n", " ori = \"-\"\n", " derep = r1.qname.split(\"=\")[-1]\n", " rname = \"{}:{}-{};size={};{}\".format(*reg, derep, ori)\n", " clust.append(\"{}\\n{}\".format(rname, aseq))\n", "\n", " # store this cluster\n", " if clust:\n", " clusters.append(\"\\n\".join(clust))\n", " idx += 1\n", "\n", " # if 1000 clusters stored then write to disk\n", " if not idx % 10000:\n", " if clusters:\n", " out.write(\"\\n//\\n//\\n\".join(clusters) + \"\\n//\\n//\\n\")\n", " clusters = []\n", "\n", "# write final remaining clusters to disk\n", "if clusters:\n", " out.write(\"\\n//\\n//\\n\".join(clusters) + \"\\n//\\n//\\n\")\n", "out.close()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "## branch to reduce size" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "## run step 6 until building clusters and check orientations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "self.remote_index_refs()\n", "self.remote_run(\n", " 
function=concat_multiple_edits,\n", " printstr=(\"concatenating \", \"s3\"),\n", " args=(),\n", ")\n", "self.remote_run(\n", " function=merge_end_to_end,\n", " printstr=(\"join unmerged pairs \", \"s3\"),\n", " args=(False, False,),\n", ")\n", "self.remote_run(\n", " function=dereplicate,\n", " printstr=(\"dereplicating \", \"s3\"),\n", " args=(self.nthreads,),\n", " threaded=True,\n", ")\n", "self.remote_run(\n", " function=split_endtoend_reads,\n", " printstr=(\"splitting dereps \", \"s3\"),\n", " args=(),\n", ")\n", "self.remote_run(\n", " function=mapping_reads,\n", " printstr=(\"mapping reads \", \"s3\"),\n", " args=(self.nthreads,),\n", " threaded=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: tortas\n", "[####################] 100% 0:16:40 | loading reads | s1 |\n", "[####################] 100% 0:00:05 | processing reads | s2 |\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see 
ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see 
ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see 
ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", " found an error in step2; see ipyrad_log.txt\n", "skipping samples not in state==2:\n", "['AGO02concat', 'AGO08concat', 'AGO09concat', 'AGO11concat', 'AGO12concat', 'AGO14concat', 'AGO18concat', 'AGO20concat', 'AGO23concat', 'AGO32concat', 'CAZ05concat', 'CAZ06concat', 'CAZ07concat', 'CAZ11concat', 'CAZ13concat', 'CAZ14concat', 'CAZ15concat', 'CF01concat', 'CF03concat', 'CF05concat', 'CF09concat', 'CF15concat', 'CF16concat', 'CRU02concat', 'CRU12concat', 'CRU14concat', 'CRU23concat', 'CRU38concat', 'CRU41concat', 'CRU43concat', 'CRU54concat', 'CRU61concat', 'CRU64concat', 'ESP01concat', 'ESP02concat', 'ESP04concat', 'ESP08concat', 'ESP09concat', 'ESP10concat', 'ESP11concat', 'ESP12concat', 'ESP13concat', 'ESP14concat', 'LP01concat', 'LP02concat', 'LP06concat', 'LP08concat', 'LT02concat', 'LT05concat', 'LT06concat', 'LT11concat', 'PBL_H01concat', 'PBL_H126concat', 'PBL_H27concat', 'PBL_H60concat', 'PBL_H86concat', 'PBL_I143concat', 'PBL_I188concat', 'PBR_B125concat', 'PBR_B152concat', 'PBR_D49concat', 'PBR_E18concat', 'PBR_F89concat', 'PBR_G244concat', 'PZ03concat', 'PZ04concat', 'PZ05concat', 'PZ06concat', 'PZ08concat', 'PZ09concat', 'PZ28concat', 'PZ67concat', 'PZ70concat', 'PZ82concat', 'SCR02concat', 'SCR05concat', 'SCR06concat', 'SCR07concat', 'SCR08concat', 'SCR10concat', 'SCR21concat', 'SCR22concat', 'SCR24concat', 'VA05concat', 'VA07concat', 'VA08concat', 'VA1082concat', 'VA373concat', 'VA378concat', 'VA387concat', 'VA423concat', 'VA984concat', 'VD02concat', 'VD05concat', 'VD06concat', 'VD08concat', 'VD13concat', 'VD14concat', 'VD15concat', 'VD23concat', 'VD24concat', 'VD25concat']\n", "\n", " Encountered an unexpected error (see ./ipyrad_log.txt)\n", " Error message is below -------------------------------\n", "no samples ready for step 3\n" ] } ], "source": [ "data.run(\"123\")" ] }, { "cell_type": "code", "execution_count": 13, 
"metadata": {}, "outputs": [], "source": [ "step.remote_concat_bams()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "step.remote_build_ref_regions()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "self = step" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Contig0', 76479, 77014),\n", " ('Contig0', 79123, 79378),\n", " ('Contig0', 81338, 81851),\n", " ('Contig0', 82205, 82795),\n", " ('Contig0', 83142, 83536),\n", " ('Contig0', 95644, 96211),\n", " ('Contig0', 101552, 101787),\n", " ('Contig0', 103468, 103708),\n", " ('Contig0', 107537, 108114),\n", " ('Contig0', 132703, 133571)]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regs = [next(self.regions) for i in range(10)]\n", "regs" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AGO09concat_10:1:76480-76715\t0\t0\t76479\t0\t235M\t-1\t-1\t235\tCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAAYAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATTTCAG\tNone\t[]\n", "AGO08concat_10:1:76480-77015\t0\t0\t76479\t0\t535M\t-1\t-1\t535\tCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAGAATCTCTATAAGCACTATAGGGCTCATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTAACCAGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\tNone\t[]\n", 
"AGO11concat_10:1:76480-76711\t0\t0\t76479\t0\t231M\t-1\t-1\t231\tCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCNGCCATCTATCTACAATATGTATCTAAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT\tNone\t[]\n", "AGO02concat_9:1:76480-76711\t0\t0\t76479\t0\t231M\t-1\t-1\t231\tCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAAYAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT\tNone\t[]\n", "AGO09concat_11:1:79124-79379\t0\t0\t79123\t0\t255M\t-1\t-1\t255\tCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGTGCTGGNCAAATGGTCGAAGACTTGTTCCAACCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\tNone\t[]\n", "AGO09concat_12:1:81339-81852\t0\t0\t81338\t0\t513M\t-1\t-1\t513\tCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACKTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\tNone\t[]\n", 
"AGO08concat_11:1:81339-81852\t0\t0\t81338\t0\t513M\t-1\t-1\t513\tCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGYTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\tNone\t[]\n", "AGO11concat_11:1:81339-81852\t0\t0\t81338\t0\t513M\t-1\t-1\t513\tCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCTGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\tNone\t[]\n", "AGO02concat_10:1:81339-81852\t0\t0\t81338\t0\t513M\t-1\t-1\t513\tCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCNGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\tNone\t[]\n", 
"AGO08concat_12:1:82206-82699\t0\t0\t82205\t0\t493M\t-1\t-1\t493\tCATGCAATGCAACAAGGTCTGGAGCTCCTTAGCGAGCCTTCGAGCCACCCAGTCCCTGAAATACACCCCCTGGTCCCTTTCAGGCTGGATCCAGAATGGAAAGTGTAACACCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\tNone\t[]\n", "AGO09concat_13:1:82469-82796\t0\t0\t82468\t0\t327M\t-1\t-1\t327\tAATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGTGCTCATCCAGGCAGGAGCAAACCCTGAGCCCCGGCCACAGGTTCAAAGGTCTCAGCACCAGAGACAGGTGGAAAACCTTTTTCCCGTCCCGCAATT\tNone\t[]\n", "AGO11concat_12:1:82469-82699\t0\t0\t82468\t0\t230M\t-1\t-1\t230\tAATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\tNone\t[]\n", "AGO02concat_11:1:82469-82699\t0\t0\t82468\t0\t230M\t-1\t-1\t230\tAATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\tNone\t[]\n", "AGO09concat_14:1:83143-83367\t0\t0\t83142\t0\t224M\t-1\t-1\t224\tAGAAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCRGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG\tNone\t[]\n", 
"AGO11concat_13:1:83146-83367\t0\t0\t83145\t0\t221M\t-1\t-1\t221\tAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCGGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG\tNone\t[]\n", "AGO02concat_12:1:83146-83537\t0\t0\t83145\t0\t391M\t-1\t-1\t391\tAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCRGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATGGCCAGCAAGGCTGGTCCATAGGGGCTGCAGGGCACGGCATTATGTCCTCCGGGGAACTTCTGCAGCGNNNNNNNNNNNNNNNNNNNNNNNNNNNTATTGCCATCCCCGCCTCTGGAACGGGACATAAAACCGGCTAGCTGGCAGGATGTCCTGAGCGCCCTGGCAGAATT\tNone\t[]\n", "AGO09concat_15:1:95645-95887\t0\t0\t95644\t0\t242M\t-1\t-1\t242\tAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG\tNone\t[]\n", "AGO08concat_13:1:95645-95887\t0\t0\t95644\t0\t242M\t-1\t-1\t242\tAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTASGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG\tNone\t[]\n", 
"AGO11concat_14:1:95645-96212\t0\t0\t95644\t0\t567M\t-1\t-1\t567\tAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATGGAGGTAAGGAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAAACCTCACAACAGCCTTGAAATCTAGTTCCATAGTCTCGAGCCCTTTGTACAAATAGGTATCCAGAAGCACAGAAACCTCAAGTGACTTGTCTCAGGTCACACCCGTATGGCACTGATGTAAGTGAACATACAGCATG\tNone\t[]\n", "AGO02concat_13:1:95645-96212\t0\t0\t95644\t0\t567M\t-1\t-1\t567\tAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAAACCTCACAACAGCCTTGAAATCTAGTTCCATCGTCTCGAGCCCTTTGTACAAATAGGTATCCAGAAGCACAGAAACCTCAAGTGACTTGTCTCAGGTCACACCCGTATGGCACTGATGTAAGTGAACATACAGCATG\tNone\t[]\n", "AGO09concat_16:1:101553-101788\t0\t0\t101552\t0\t235M\t-1\t-1\t235\tCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGNCTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\tNone\t[]\n", "AGO08concat_14:1:101553-101788\t0\t0\t101552\t0\t235M\t-1\t-1\t235\tCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\tNone\t[]\n", 
"AGO11concat_15:1:101553-101788\t0\t0\t101552\t0\t235M\t-1\t-1\t235\tCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\tNone\t[]\n", "AGO02concat_14:1:101553-101788\t0\t0\t101552\t0\t235M\t-1\t-1\t235\tCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\tNone\t[]\n", "AGO09concat_17:1:103469-103709\t0\t0\t103468\t0\t240M\t-1\t-1\t240\tCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\tNone\t[]\n", "AGO08concat_15:1:103469-103709\t0\t0\t103468\t0\t240M\t-1\t-1\t240\tCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\tNone\t[]\n", "AGO11concat_16:1:103469-103709\t0\t0\t103468\t0\t240M\t-1\t-1\t240\tCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\tNone\t[]\n", "AGO02concat_15:1:103469-103709\t0\t0\t103468\t0\t240M\t-1\t-1\t240\tCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\tNone\t[]\n", 
"AGO02concat_16:1:107538-108115\t0\t0\t107537\t0\t577M\t-1\t-1\t577\tCATGTTTAATCCTGAGAAGAGAAGACCAAGGGGGGACCCAATAACAGTCTTCAAATCTGTTAAGGGCTGTTATAGACAGGACGGTGATCAATTNNNNNNCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGGAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT\tNone\t[]\n", "AGO09concat_18:1:107637-108115\t0\t0\t107636\t0\t478M\t-1\t-1\t478\tCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGTAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT\tNone\t[]\n", "AGO08concat_16:1:107637-108115\t0\t0\t107636\t0\t478M\t-1\t-1\t478\tCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT\tNone\t[]\n", 
"AGO11concat_17:1:107637-108115\t0\t0\t107636\t0\t478M\t-1\t-1\t478\tCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGTAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT\tNone\t[]\n", "AGO09concat_19:1:132704-133572\t0\t0\t132703\t0\t868M\t-1\t-1\t868\tCATGTATTAAACTTGAACACCTGCCCTTGCACTAGTAGGATTTTATGTTTGTATTTTGCATAACTTAGAATAAATAATAATGATATGATTTAACCGTATCCTGCCAGCCACCTTGGTAGAAATGCATCAGCCAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAATTCTGTTTTGTGTGNGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCNGTAATCAAAACCAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAAACTTTGGGTTCATCCTGCCAAGACTTCCCCAGAGCGTAGTGTCGTGATCGACAGAACCCAGCTCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACTCTGGACTAGTAACTATAACATCGACTGGTGGGAGAGTGTGCGTGTGAGTGATTGAATGAATGTGCTAATT\tNone\t[]\n", 
"AGO02concat_17:1:132704-133221\t0\t0\t132703\t0\t517M\t-1\t-1\t517\tCATGTATTAAACTTGAACACCTGCCCTTGCACTAGTAGGATTTTATGTTTGTATTTTGCATAACTTAGAATAAATAATAATGATATGATTTAACCGTATCCTGCCAGCCACCTTGGTAGAAATGCATCAGCCAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG\tNone\t[]\n", "AGO11concat_18:1:132993-133572\t0\t0\t132992\t0\t579M\t-1\t-1\t579\tAATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAAACTTTGGGTTCATCCTGCCAAGACTTCCCCAGAGCGTAGTGTCGTGATCGACAGAACCCAGCTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACTCTGGACTAGTAACTATAACATCGACTGGTGGGAGAGTGTGCGTGTGAGTGATTGAATGAATGTGCTAATT\tNone\t[]\n" ] } ], "source": [ "# access reads from bam file using pysam\n", "bamfile = AlignmentFile(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"cat.sorted.bam\"),\n", " 'rb')\n", "\n", "# catcons output file for raw clusters and db samples\n", "outfile = gzip.open(\n", " os.path.join(\n", " self.data.dirs.across,\n", " \"{}.catcons.gz\".format(self.data.name)),\n", " 'wb')\n", "\n", "# write header line to catcons with sample names\n", "outfile.write(\n", " b\" \".join([i.name.encode() for i in self.samples]) + b\"\\n\")\n", "\n", "# get clusters\n", "lidx = 0\n", "clusts = []\n", "# while 1:\n", "\n", "# try:\n", "# region = next(self.regions)\n", "# reads = bamfile.fetch(*region)\n", "# except StopIteration:\n", "# 
break\n", "\n", "for region in regs:\n", " reads = bamfile.fetch(*region)\n", " \n", " # get reference\n", " refn, refs = get_ref_region(\n", " data.paramsdict[\"reference_sequence\"], \n", " region[0], region[1]+1, region[2]+1) \n", " \n", " # build cluster dict for sorting \n", " rdict = {}\n", " for read in reads:\n", " print(read)\n", " rdict[read.qname] = read.seq \n", " keys = sorted(rdict.keys(), key=lambda x: x.rsplit(\":\", 2)[0])\n", " \n", " # build cluster based on map positions (reads are already oriented)\n", " arr = np.zeros((len(rdict) + 1, len(refs)), dtype=bytes)\n", " arr[0] = list(refs)\n", " for idx, key in enumerate(keys):\n", " rstart, rstop = key.rsplit(\":\", 1)[-1].split(\"-\")\n", " start = max(0, region[1] - (int(rstart) - 1))\n", " stop = min(len(refs), int(rstop) - int(rstart))\n", " #print(start, stop, len(rdict[key]), refn)\n", " #print(rdict[key])\n", " arr[idx + 1, int(start): int(stop)] = list(rdict[key]) \n", " arr[arr == b\"\"] = b\"N\"\n", " for line in arr:\n", " outfile.write(line.tostring() + b\"\\n\")\n", " #print(line.tostring(), file=outfile)\n", " #print(b\"\".join(line), file=outfile)\n", " outfile.write(b\"\\n\")\n", " \n", " \n", "outfile.close()" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "132703" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "region[1]" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTTGTATTAGAGATGGGGCTGAAGTCAAACCCTGACCTGAACAGTGAGAAAATCTGGTGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCCTCTGCCTCCCTCAAGAAGAAATCAAGAGKGGAAAGGAGCAGGGCGGGAGGTATGGGAAAGGAAGAATGGAATT'" ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "revcomp(\"AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\")" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "cannot copy sequence with size 231 to array axis with dimension 535", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrefs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 45\u001b[0;31m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 46\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;31m# get consens seq and variant site index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: cannot copy sequence with size 231 to array axis with dimension 535" ] } ], "source": [ " # get consens seq and variant site index \n", " clust = []\n", " avars = refvars(arr.view(np.uint8), PSEUDO_REF)\n", " dat = b\"\".join(avars.view(\"S1\")[:, 0]).decode()\n", " snpstring = \"\".join([\"*\" if i else \" \" for i in avars[:, 1]])\n", " clust.append(\"ref_{}:{}-{}\\n{}\".format(*region, dat))\n", "\n", " # or, just build variant string (or don't...)\n", " # write all loci with [locids, nsnps, npis, nindels, ?]\n", " for key in keys:\n", " clust.append(\"{}\\n{}\".format(key, rdict[key]))\n", " clust.append(\"SNPs\\n\" + snpstring)\n", " 
clusts.append(\"\\n\".join(clust))\n", "\n", " # advance locus counter\n", " lidx += 1\n", "\n", " # write chunks to file\n", " if not lidx % 1000:\n", " outfile.write(\n", " str.encode(\"\\n//\\n//\\n\".join(clusts) + \"\\n//\\n//\\n\"))\n", " clusts = []\n", "\n", "# write remaining\n", "if clusts: \n", " outfile.write(\n", " str.encode(\"\\n//\\n//\\n\".join(clusts) + \"\\n//\\n//\\n\"))\n", "outfile.close()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "cannot copy sequence with size 543 to array axis with dimension 559", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote_build_ref_clusters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/cluster_across.py\u001b[0m in \u001b[0;36mremote_build_ref_clusters\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 589\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 590\u001b[0;31m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 591\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[0;31m# get consens seq and variant site index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: cannot copy sequence with size 543 to array axis with dimension 559" ] } ], "source": [ "step.remote_build_ref_clusters()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from ipyrad.assemble.clustmap import *" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "self = Step3(data, 8, True, ipyclient)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | indexing reference | s3 |\n", "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:05:42 | join unmerged pairs | s3 |\n", "[####################] 100% 0:02:17 | dereplicating | s3 |\n", "[####################] 100% 0:00:39 | splitting dereps | s3 |\n", "[####################] 100% 1:53:03 | mapping reads | s3 |\n", "[####################] 100% 1:40:12 | building clusters | s3 |\n", "[####################] 100% 0:00:24 | calc cluster stats | s3 |\n" ] } ], "source": [ "self.run()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: 5-tortas\n", "[####################] 100% 
0:15:41 | inferring [H, E] | s4 |\n", "[####################] 100% 0:00:26 | calculating depths | s5 |\n", "[####################] 100% 0:00:45 | chunking clusters | s5 |\n", "[####################] 100% 16:10:59 | consens calling | s5 |\n", "[####################] 100% 0:01:31 | indexing alleles | s5 |\n" ] } ], "source": [ "self.data.run(\"45\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | indexing reference | s3 |\n", "[####################] 100% 0:00:00 | concatenating | s3 |\n", "[####################] 100% 0:05:49 | join unmerged pairs | s3 |\n" ] } ], "source": [ "self.remote_index_refs()\n", "self.remote_run(\n", " function=concat_multiple_edits,\n", " printstr=(\"concatenating \", \"s3\"),\n", " args=(),\n", ")\n", "self.remote_run(\n", " function=merge_end_to_end,\n", " printstr=(\"join unmerged pairs \", \"s3\"),\n", " args=(False, False,),\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:02:22 | dereplicating | s3 |\n" ] } ], "source": [ "self.remote_run(\n", " function=dereplicate,\n", " printstr=(\"dereplicating \", \"s3\"),\n", " args=(self.nthreads,),\n", " threaded=True,\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:36 | splitting dereps | s3 |\n" ] } ], "source": [ "self.remote_run(\n", " function=split_endtoend_reads,\n", " printstr=(\"splitting dereps \", \"s3\"),\n", " args=(),\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[########## ] 50% 1:18:01 | mapping reads | s3 |" ] } ], "source": [ "self.remote_run(\n", " function=mapping_reads,\n", " printstr=(\"mapping 
reads \", \"s3\"),\n", " args=(self.nthreads,),\n", " threaded=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sample = list(self.data.samples.values())[0]\n", "merge_end_to_end(self.data, sample, True, True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample = list(data.samples.values())[0]\n", "\n", "infiles = [\n", " os.path.join(data.dirs.edits, \"{}.trimmed_R1_.fastq.gz\".format(sample.name)),\n", " os.path.join(data.dirs.edits, \"{}_R1_concatedit.fq.gz\".format(sample.name)),\n", " os.path.join(data.tmpdir, \"{}_merged.fastq\".format(sample.name)),\n", " os.path.join(data.tmpdir, \"{}_declone.fastq\".format(sample.name)),\n", "]\n", "infiles = [i for i in infiles if os.path.exists(i)]\n", "infile = infiles[-1]\n", "\n", "infile" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/home/deren/Documents/ipyrad/bin/vsearch-linux-x86_64',\n", " '--derep_fulllength',\n", " '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq',\n", " '--strand',\n", " 'plus',\n", " '--output',\n", " '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_derep.fastq',\n", " '--threads',\n", " '2',\n", " '--fasta_width',\n", " '0',\n", " '--fastq_qmax',\n", " '1000',\n", " '--sizeout',\n", " '--relabel_md5']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "strand = \"plus\"\n", "if data.paramsdict[\"datatype\"] in ('gbs', '2brad'):\n", " strand = \"both\"\n", "nthreads=2\n", "\n", "cmd = [\n", " ip.bins.vsearch,\n", " 
\"--derep_fulllength\", infile,\n", " \"--strand\", strand,\n", " \"--output\", os.path.join(data.tmpdir, sample.name + \"_derep.fastq\"),\n", " \"--threads\", str(nthreads),\n", " \"--fasta_width\", str(0),\n", " \"--fastq_qmax\", \"1000\",\n", " \"--sizeout\", \n", " \"--relabel_md5\",\n", "]\n", "cmd" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)\n", "errmsg = proc.communicate()[0]\n", "if proc.returncode:\n", " ip.logger.error(\"error inside dereplicate %s\", errmsg)\n", " raise IPyradWarningExit(errmsg)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ ] 0% 0:01:55 | dereplicating | s3 |" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprintstr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"dereplicating \"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnthreads\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mthreaded\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m )\n", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mremote_run\u001b[0;34m(self, printstr, function, args, 
threaded)\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0mready\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mrasyncs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrasyncs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_progressbar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprintstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 600\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 601\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "s.remote_run(\n", " function=dereplicate,\n", " printstr=(\"dereplicating \", \"s3\"),\n", " args=(s.nthreads,),\n", " threaded=True,\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", 
"name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }