import ipyrad as ip
import ipyparallel as ipp

# handle to the ipyparallel cluster used by the assembly steps below
ipyclient = ipp.Client()

# One entry per test assembly: (assembly name, parameter settings).
# Settings are applied with set_params in the listed order.
ASSEMBLY_SPECS = (
    ("1-pairtest", (
        ("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz"),
        ("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt"),
        ("datatype", "pairddrad"),
        ("restriction_overhang", ("TGCAG", "CGG")),
    )),
    ("2-setest", (
        ("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz"),
        ("barcodes_path", "ipsimdata/rad_example_barcodes.txt"),
        ("datatype", "rad"),
        ("restriction_overhang", ("TGCAG", "")),
    )),
    ("3-refsetest", (
        ("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz"),
        ("barcodes_path", "ipsimdata/rad_example_barcodes.txt"),
        ("datatype", "rad"),
        ("assembly_method", "reference"),
        ("reference_sequence", "ipsimdata/rad_example_genome.fa"),
        ("restriction_overhang", ("TGCAG", "")),
    )),
    # NOTE(review): this paired assembly points at the rad barcodes file,
    # unlike 1-pairtest -- confirm that is intended.
    ("4-refpairtest", (
        ("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz"),
        ("barcodes_path", "ipsimdata/rad_example_barcodes.txt"),
        ("datatype", "pairddrad"),
        ("assembly_method", "reference"),
        ("reference_sequence", "ipsimdata/pairddrad_example_genome.fa"),
        ("restriction_overhang", ("TGCAG", "CGG")),
    )),
    ("5-tortas", (
        ("project_dir", "tortas"),
        ("sorted_fastq_path", "/home/deren/Dropbox/Maud/fastq-concats/*.gz"),
        ("datatype", "pairddrad"),
        ("assembly_method", "reference"),
        ("reference_sequence", "/home/deren/Dropbox/Maud/lgeorge.genome.fa"),
        ("restriction_overhang", ("CATG", "AATT")),
        ("filter_adapters", 2),
    )),
)

# Build every assembly in turn; `data` is rebound each time, so after the
# loop it refers to the last one listed (5-tortas).
for assembly_name, settings in ASSEMBLY_SPECS:
    data = ip.Assembly(assembly_name)
    for param, value in settings:
        data.set_params(param, value)

# run steps 1 and 2 on the current assembly, overwriting earlier results
data.run("12", force=True)

# Reload the saved tortas assembly. The other assemblies were previously
# loaded here from "1-pairtest.json" ... "4-refpairtest.json".
data = ip.load_json("tortas/5-tortas.json")
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statereads_rawreads_passed_filter
AGO02concat21105029410800672
AGO08concat21340840113030329
AGO09concat21565012715121047
AGO11concat21284893612370018
\n", "
# Star imports are kept because the rest of the notebook relies on the names
# they inject. NOTE(review): os, sps, LOGGER and Step3 are never imported
# explicitly in this notebook, so they presumably come from these modules --
# confirm and replace with explicit imports.
from ipyrad.assemble.cluster_across import *
from ipyrad.assemble.clustmap import *
import ipyrad  # used further down as ipyrad.bins.vsearch

# --- run step 3 (and 4-5) on the current assembly ---------------------------
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]

s3.data.ipcluster["threads"] = 4
s3.run()
s3.data.run("45")

# --- settings for the across-sample clustering prototype --------------------
data = s3.data
jobid = 0
samples = list(data.samples.values())[:4]
randomseed = 123

# The output directory must exist BEFORE anything is written into it.
# Previously it was assigned/created only after the concatenation had run,
# which forced the whole pipeline to be pasted and executed a second time.
data.dirs.across = os.path.join(data.name + "_across")
if not os.path.exists(data.dirs.across):
    os.makedirs(data.dirs.across)

# --- collect consensus files of samples that have consensus reads -----------
conshandles = sorted(
    samp.files.consens[0] for samp in samples
    if samp.stats.reads_consens)
assert conshandles, "no consensus files found"

# --- concatenate all of the gzipped consens files ---------------------------
cmd = ['cat'] + conshandles
groupcons = os.path.join(
    data.dirs.across,
    "{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'wb') as output:
    call = sps.Popen(cmd, stdout=output, close_fds=True)
    call.communicate()

# --- sed program that temporarily replaces hetero (ambiguity) codes ---------
# Lines containing '>' are skipped, so taxon names are not affected.
# W/R/M -> A, K -> T, S/Y -> C, upper case first and then lower case.
HETERO_SUBS = (("W", "A"), ("R", "A"), ("M", "A"),
               ("K", "T"), ("S", "C"), ("Y", "C"))
subs = ";".join(
    "/>/!s/{}/{}/g".format(code, base)
    for ambig, base in HETERO_SUBS
    for code in (ambig, ambig.lower()))

# --- pipe the concatenated data from gunzip through sed ---------------------
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))

allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz")
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
with open(allhaps, 'wb') as output:
    proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
    # close our copy of the pipe right away so gunzip receives SIGPIPE if
    # sed exits early (standard subprocess pipeline pattern)
    proc1.stdout.close()
    proc2.communicate()
# reap gunzip so it does not linger as a zombie process
proc1.wait()
def _pad_read_to_region(read, reg):
    """
    Lay one mapped read onto a '-' filled row spanning the region.

    The read sequence is passed through `cigared(read.seq, read.cigar)` and
    written at its offset from the region start (reg[1]).

    Returns a 1-D numpy array of single characters, length reg[2] - reg[1].
    """
    row = np.zeros(reg[2] - reg[1], dtype="U1")
    row.fill("-")
    seq = cigared(read.seq, read.cigar)
    # how far ahead of the region start does this read begin
    start = read.reference_start - reg[1]
    row[start:start + len(seq)] = list(seq)
    return row


def _format_cluster_entry(reg, read, seq):
    """
    Return the two-line "name\\nsequence" entry for one read (or read pair).

    The name is "<chrom>:<start>-<end>;size=<derep>;<ori>", where <derep> is
    whatever follows the last '=' in the read's qname and <ori> is '-' for
    reverse-strand reads and '+' otherwise.
    """
    ori = "-" if read.is_reverse else "+"
    derep = read.qname.split("=")[-1]
    rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
    return "{}\n{}".format(rname, seq)


def build_clusters_from_cigars(data, sample):
    """
    Write `<clusts>/<sample.name>.clustS.gz` from reference-mapped reads.

    For every merged region reported by `bedtools_merge(data, sample)`, the
    reads overlapping it are fetched from
    `<refmapping>/<sample.name>-mapped-sorted.bam`, ordered by the integer
    after the last '=' in their qname (largest first), and padded with '-'
    to the region length. For paired datatypes, the two reads sharing a
    qname are combined with `join_arrays`, and reads without a mate in the
    region are dropped. Clusters are separated by "//\\n//\\n" lines.

    Parameters
    ----------
    data : Assembly-like
        Reads `data.paramsdict["datatype"]`, `data.dirs.refmapping` and
        `data.dirs.clusts`.
    sample : Sample-like
        Reads `sample.name`.

    Returns
    -------
    None. The result is the gzipped cluster file on disk.

    Notes
    -----
    Regions that yield no entries are skipped. Writing them produced a blank
    line plus separators (an odd number of lines), which shifts the two-line
    name/sequence pairing that readers of this file rely on.
    """
    # number of clusters buffered in memory between writes to disk
    flush_every = 10
    separator = "\n//\n//\n"
    ispaired = "pair" in data.paramsdict["datatype"]

    # get all regions with reads. Generator to yield (str, int, int).
    # Blank lines (e.g. empty bedtools output) are ignored.
    fullregions = bedtools_merge(data, sample).strip().split("\n")
    regions = (i.split("\t") for i in fullregions if i.strip())
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    bampath = os.path.join(
        data.dirs.refmapping,
        "{}-mapped-sorted.bam".format(sample.name))
    opath = os.path.join(
        data.dirs.clusts, "{}.clustS.gz".format(sample.name))

    clusters = []
    # both handles are closed even if a region fails to parse
    with AlignmentFile(bampath, 'rb') as bamfile, \
            gzip.open(opath, 'wt') as out:

        for reg in regions:
            # uncomment and compare against ref sequence when testing
            #ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
            reads = bamfile.fetch(*reg)

            # store reads in a dict keyed by qname
            rdict = {}
            if ispaired:
                # match paired reads together: [first seen, second seen]
                for read in reads:
                    if read.qname not in rdict:
                        rdict[read.qname] = [read, None]
                    else:
                        rdict[read.qname][1] = read
            else:
                for read in reads:
                    rdict[read.qname] = read

            # sort keys by derep number
            keys = sorted(
                rdict.keys(),
                key=lambda x: int(x.split("=")[-1]), reverse=True)

            # build the cluster based on map positions, orientation, cigar
            clust = []
            for key in keys:
                if ispaired:
                    r1, r2 = rdict[key]
                    if not (r1 and r2):
                        continue
                    row = join_arrays(
                        _pad_read_to_region(r1, reg),
                        _pad_read_to_region(r2, reg))
                else:
                    r1 = rdict[key]
                    row = _pad_read_to_region(r1, reg)
                clust.append(_format_cluster_entry(reg, r1, "".join(row)))

            # nothing usable in this region: do not emit an empty cluster
            if not clust:
                continue

            # store this cluster and flush the buffer once it is full
            clusters.append("\n".join(clust))
            if len(clusters) >= flush_every:
                out.write(separator.join(clusters) + separator)
                clusters = []

        # write final remaining clusters to disk (if any)
        if clusters:
            out.write(separator.join(clusters) + separator)
def get_quick_depths(data, sample):
    """
    Iterate over a sample's clustS file and collect per-cluster stats.

    The file is read two lines at a time as (name, seq). A pair whose two
    stripped lines are equal is treated as the end of a cluster. For any
    other pair, the integer parsed from the name (text after the last '='
    minus its final two characters) is added to the cluster depth, and the
    raw length of the sequence line, including its line ending, is
    remembered.

    Parameters
    ----------
    data : Assembly-like
        Reads `data.dirs.clusts` and `data.samples[sample.name]`.
    sample : Sample-like
        Reads `sample.name`. `sample.files.clusters` is filled in with the
        default path when it is empty.

    Returns
    -------
    (maxlens, depths) : tuple of numpy arrays, one value per cluster.
    """
    ## use existing sample cluster path if it exists, since this
    ## func can be used in step 4 and that can occur after merging
    ## assemblies after step3, and if we then referenced by data.dirs.clusts
    ## the path would be broken.
    if not sample.files.clusters:
        sample.files.clusters = os.path.join(
            data.dirs.clusts, sample.name + ".clustS.gz")

    ## get new clustered loci
    fclust = data.samples[sample.name].files.clusters

    ## storage
    depths = []
    maxlen = []

    ## start with cluster 0
    tdepth = 0
    tlen = 0

    ## the handle is closed even if parsing fails part-way through
    with gzip.open(fclust, 'rt') as clusters:
        ## built-in zip is lazy on Python 3, same as the old izip
        pairdealer = zip(*[iter(clusters)] * 2)

        for name, seq in pairdealer:
            ## end of a cluster: store and reset the running values
            if name.strip() == seq.strip():
                depths.append(tdepth)
                maxlen.append(tlen)
                tlen = 0
                tdepth = 0

            else:
                try:
                    tdepth += int(name.strip().split("=")[-1][:-2])
                    tlen = len(seq)
                except ValueError:
                    ## name without a parsable size: report it and move on
                    print(name)

    return np.array(maxlen), np.array(depths)


def _flagstat_total(bamfile):
    """
    Run `samtools flagstat` on `bamfile` and return the first whitespace
    separated token of its output as an int. This assumes that token is
    the number of reads in the file.
    """
    cmd = [ip.bins.samtools, "flagstat", bamfile]
    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
    result = proc.communicate()[0]
    return int(result.split()[0])


def _remove_existing(paths):
    """Delete every path in `paths` that exists on disk."""
    for rfile in paths:
        if os.path.exists(rfile):
            os.remove(rfile)


def sample_cleanup(data, sample):
    """
    Compute step-3 stats for a sample, store them, and remove temp files.

    Depth and length arrays come from `get_quick_depths`. Results are
    written to `sample.stats`, `sample.stats_dfs.s3` and `sample.depths`.
    `data._hackersonly["max_fragment_length"]` is raised when the observed
    high-depth cluster lengths exceed it. For non-denovo assemblies, the
    mapped and unmapped read counts are taken from `samtools flagstat`.

    Returns None. If the sample has no clusters, or only zero-depth ones, a
    message is printed and nothing is stored.
    """
    ## get maxlen and depths array from clusters
    maxlens, depths = get_quick_depths(data, sample)

    ## Bail out if there are no clusters at all, or only zero-depth ones.
    ## (.max() raises on an empty array, so test the size first.)
    if not depths.size or not depths.max():
        print(" no clusters found for {}".format(sample.name))
        return

    ## store which min was used to calculate hidepth here
    mindepth_mj = data.paramsdict["mindepth_majrule"]
    sample.stats_dfs.s3["hidepth_min"] = mindepth_mj

    # If our longest sequence is longer than the current max_fragment_len
    # then update max_fragment_length. For assurance we require that
    # max len is 4 greater than maxlen, to allow for pair separators.
    hidepths = depths >= mindepth_mj
    maxlens = maxlens[hidepths]

    ## Handle the case where there are no hidepth clusters
    if maxlens.any():
        maxlen = int(maxlens.mean() + (2. * maxlens.std()))
    else:
        maxlen = 0
    if maxlen > data._hackersonly["max_fragment_length"]:
        data._hackersonly["max_fragment_length"] = maxlen + 4

    ## make sense of stats
    keepmj = depths[hidepths]
    keepstat = depths[depths >= data.paramsdict["mindepth_statistical"]]

    ## sample summary stat assignments
    sample.stats["state"] = 3
    sample.stats["clusters_total"] = depths.shape[0]
    sample.stats["clusters_hidepth"] = keepmj.shape[0]

    ## store depths histogram as a dict. Limit to first 25 bins
    bars, bins = np.histogram(depths, bins=range(1, 26))
    sample.depths = {int(i): int(v) for i, v in zip(bins, bars) if v}

    ## sample stat assignments
    ## Trap numpy warnings ("mean of empty slice") printed by samples
    ## with few reads.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        s3stats = sample.stats_dfs.s3
        s3stats["merged_pairs"] = sample.stats.reads_merged
        s3stats["clusters_total"] = depths.shape[0]
        try:
            s3stats["clusters_hidepth"] = (
                int(sample.stats["clusters_hidepth"]))
        except ValueError:
            ## Handle clusters_hidepth == NaN
            s3stats["clusters_hidepth"] = 0
        s3stats["avg_depth_total"] = depths.mean()
        s3stats["avg_depth_mj"] = keepmj.mean()
        s3stats["avg_depth_stat"] = keepstat.mean()
        s3stats["sd_depth_total"] = depths.std()
        s3stats["sd_depth_mj"] = keepmj.std()
        s3stats["sd_depth_stat"] = keepstat.std()

    ## Get some stats from the bam files
    ## This is moderately hackish. samtools flagstat returns
    ## the number of reads in the bam file as the first element
    ## of the first line, this call makes this assumption.
    if data.paramsdict["assembly_method"] != "denovo":
        mapf = os.path.join(
            data.dirs.refmapping, sample.name + "-mapped-sorted.bam")
        umapf = os.path.join(
            data.dirs.refmapping, sample.name + "-unmapped.bam")

        nunmapped = _flagstat_total(umapf)
        nmapped = _flagstat_total(mapf)

        ## If PE, samtools reports the _actual_ number of reads mapped, both
        ## R1 and R2, so here if PE divide the results by 2 to stay consistent
        ## with how we've been reporting R1 and R2 as one "read pair".
        ## Floor division keeps the counts as ints on Python 3.
        if "pair" in data.paramsdict["datatype"]:
            nunmapped //= 2
            nmapped //= 2
        sample.stats["refseq_unmapped_reads"] = nunmapped
        sample.stats["refseq_mapped_reads"] = nmapped

    ## remove refmapping leftovers, if present
    _remove_existing([
        os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam"),
        os.path.join(data.dirs.refmapping, sample.name + ".sam"),
    ])

    ## Clean up loose files only if not in DEBUG (level 10)
    ##- edits/*derep, utemp, *utemp.sort, *htemp, *clust.gz
    if ip.logger.getEffectiveLevel() != 10:
        _remove_existing([
            os.path.join(data.dirs.edits, sample.name + "_derep.fastq"),
            os.path.join(data.dirs.edits, sample.name + "_merged_.fastq"),
            os.path.join(data.dirs.clusts, sample.name + ".utemp"),
            os.path.join(data.dirs.clusts, sample.name + ".utemp.sort"),
            os.path.join(data.dirs.clusts, sample.name + ".htemp"),
            os.path.join(data.dirs.clusts, sample.name + ".clust.gz"),
        ])
# (continuation of the "Step 6 for refmapped pairs" plan)
# 3. pull consens reads in aligned regions with bamfile.fetch()
# 4. store the consensus sequence in h5.
# 5. store the variants in h5.
# 6. store the depth of variants in h5.
# 7.

# --- prototype: walk merged regions of one sample by hand -------------------
# get all regions with reads. Generator to yield (str, int, int)
fullregions = bedtools_merge(data, sample).strip().split("\n")
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)

# access reads from bam file using pysam
# NOTE(review): this handle is never closed in the notebook.
samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping,
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')

# take the next region, its reference sequence, and the reads overlapping it
reg = next(regions)
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = samfile.fetch(*reg)

# index reads by qname (single-end handling: a later read with the same
# qname replaces the earlier one; no mate pairing is done here)
rdict = {}
for read in reads:
    rdict[read.qname] = read

# sort keys by derep number (integer after the last '='), largest first
keys = sorted(
    rdict.keys(),
    key=lambda x: int(x.split("=")[-1]), reverse=True)

# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
    r1 = rdict[key]

    # row of '-' as long as the fetched reference sequence (ref[1])
    aref = np.array(list(ref[1]))
    arr1 = np.zeros(aref.size, dtype="U1")
    arr1.fill("-")

    # how far ahead of the start does this read begin
    seq = cigared(r1.seq, r1.cigar)
    start = r1.reference_start - reg[1]
    arr1[start:start + len(seq)] = list(seq)
    aseq = "".join(arr1)

    # name is "<chrom>:<start>-<end>;size=<derep>;<orientation>"
    ori = "+"
    if r1.is_reverse:
        ori = "-"
    derep = r1.qname.split("=")[-1]
    rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
    clust.append("{}\n{}".format(rname, aseq))

# display the entries built for this region
clust

# display the qname -> read mapping
rdict

# --- ad-hoc calls of individual step functions -------------------------------
# NOTE(review): the saved output of this cell is an IPyradError traceback.
mapping_reads(data, sample, 2)

# re-run step 5 on the Step3 assembly
s3.data.run("5")

# samples ordered by number of high-depth clusters, largest first
subsamples = list(s3.data.samples.values())
subsamples.sort(key=lambda x: x.stats.clusters_hidepth, reverse=True)
jobs = {}

# NOTE(review): the saved output of this cell is an IndexError raised inside
# stackarray ("index 499 is out of bounds for axis 0 with size 499").
from ipyrad.assemble.jointestimate import *
optim(data, sample)

# NOTE(review): rebinds `sample` and `maxlen` from recal_hidepth's return
# value, so later cells see the new objects.
sample, _, _, nhidepth, maxlen = recal_hidepth(data, sample)

# step-3 functions called one at a time on a single sample
data
concat_multiple_edits(data, sample)
merge_pairs_with_vsearch(data, sample, True)
merge_end_to_end(data, sample, True, True)

dereplicate(data, sample, 2)

cluster(data, sample, 2, 1)

build_clusters(data, sample, 5)

muscle_chunker(data, sample)

# align the first ten chunk files found in the Step3 tmpdir
for idx in range(10):
    handle = os.path.join(s3.data.tmpdir,
        "{}_chunk_{}.ali".format(sample.name, idx))
    align_and_parse(handle, 5, 0)

reconcat(data, sample)

# full step-3 run through the Step3 object
s3.run()
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([460, 475, 470, 482, 468, 463, 488, 445, 474, 490, 446, 469, 482,\n", " 457, 457, 470, 477, 445, 442, 468, 486, 447, 473, 460, 484, 467,\n", " 482, 442, 446, 447, 458, 471, 455, 476, 480, 443, 461, 457, 479,\n", " 452, 444, 464, 473, 477, 448, 446, 453, 488, 490, 482, 471, 483,\n", " 480, 450, 456, 482, 454, 462, 457, 456, 455, 463, 447, 488, 457,\n", " 446, 472, 450, 472, 482, 472, 451, 491, 479, 469, 446, 489, 486,\n", " 463, 463, 489, 476, 458, 484, 463, 458, 443, 479, 488, 469, 470,\n", " 474, 454, 447, 447, 489, 451, 449, 453, 491, 484, 471, 483, 481,\n", " 483, 472, 468, 457, 464, 446, 490, 462, 452, 461, 457, 474, 469,\n", " 481, 458, 461, 446, 472, 455, 455, 488, 471, 482, 489, 466, 461,\n", " 473, 477, 466, 488, 461, 487, 453, 466, 450, 472, 458, 448, 490,\n", " 484, 476, 445, 488, 451, 484, 486, 463, 469, 452, 462, 453, 476,\n", " 448, 445, 468, 453, 463, 469, 487, 445, 453, 469, 447, 444, 464,\n", " 488, 468, 448, 461, 465, 483, 446, 447, 473, 448, 448, 444, 481,\n", " 464, 474, 464, 459, 461, 458, 451, 449, 468, 445, 491, 443, 448,\n", " 475, 450, 454, 454, 467, 478, 452, 451, 468, 480, 468, 464, 484,\n", " 462, 484, 475, 457, 463, 453, 473, 457, 486, 469, 485, 462, 464,\n", " 442, 475, 473, 470, 482, 444, 456, 479, 472, 477, 446, 443, 471,\n", " 473, 489, 467, 463, 476, 481, 448, 450, 460, 452, 458, 479, 444,\n", " 457, 489, 488, 444, 453, 472, 461, 481, 451, 450, 491, 473, 480,\n", " 491, 489, 468, 458, 472, 460, 450, 462, 447, 457, 465, 452, 468,\n", " 475, 465, 491, 447, 453, 448, 471, 490, 472, 446, 450, 478, 468,\n", " 455, 470, 454, 464, 479, 472, 456, 475, 479, 446, 474, 470, 474,\n", " 446, 491, 479, 470, 457, 457, 482, 453, 468, 457, 454, 453, 445,\n", " 489, 477, 451, 478, 485, 456, 485, 449, 485, 478, 468, 462, 454,\n", " 486, 482, 471, 
488, 444, 485, 450, 485, 449, 444, 453, 482, 484,\n", " 472, 456, 474, 461, 487, 468, 482, 480, 459, 443, 489, 447, 444,\n", " 453, 447, 470, 486, 442, 442, 487, 445, 471, 480, 475, 450, 456,\n", " 473, 468, 455, 452, 491, 470, 479, 469, 469, 442, 489, 451, 476,\n", " 464, 477, 487, 484, 468, 454, 462, 460, 476, 442, 476, 489, 481,\n", " 477, 485, 456, 453, 476, 471, 473, 453, 480, 490, 479, 488, 470,\n", " 483, 483, 450, 451, 459, 469, 456, 461, 458, 451, 450, 446, 482,\n", " 446, 448, 472, 489, 485, 489, 449, 477, 488, 452, 460, 450, 471,\n", " 444, 452, 456, 452, 467, 455, 489, 481, 474, 463, 456, 475, 453,\n", " 445, 487, 481, 445, 476, 476, 483, 456, 455, 444, 443, 452, 478,\n", " 488, 452, 471, 452, 483, 484, 490, 457, 487, 464, 455, 480, 491,\n", " 475, 449, 489, 447, 468, 468, 485, 475, 484, 455, 444, 450, 466,\n", " 448, 442, 459, 488, 482, 455, 474, 480, 482, 474, 484, 488, 469,\n", " 475, 481, 482, 446, 473]),\n", " array([17, 14, 17, 20, 25, 21, 18, 25, 21, 24, 22, 23, 21, 22, 23, 20, 17,\n", " 22, 19, 20, 25, 26, 17, 21, 19, 25, 19, 21, 20, 16, 21, 18, 22, 22,\n", " 20, 21, 22, 23, 20, 19, 19, 17, 16, 27, 21, 20, 17, 19, 20, 18, 15,\n", " 20, 24, 20, 19, 19, 20, 23, 20, 23, 20, 19, 20, 23, 22, 25, 17, 27,\n", " 22, 26, 22, 19, 21, 18, 21, 20, 19, 20, 25, 19, 19, 16, 23, 19, 17,\n", " 18, 18, 21, 16, 19, 26, 22, 22, 23, 18, 22, 21, 24, 18, 23, 19, 18,\n", " 20, 18, 21, 22, 21, 22, 19, 19, 18, 20, 17, 19, 22, 25, 20, 21, 19,\n", " 21, 19, 20, 24, 19, 22, 22, 20, 19, 21, 20, 17, 21, 19, 19, 22, 19,\n", " 19, 16, 19, 22, 18, 19, 15, 18, 18, 20, 20, 25, 17, 21, 19, 18, 18,\n", " 22, 21, 21, 19, 21, 14, 25, 19, 17, 20, 21, 14, 20, 21, 20, 18, 25,\n", " 27, 16, 19, 18, 21, 22, 15, 18, 19, 21, 21, 19, 17, 20, 19, 20, 22,\n", " 24, 18, 18, 18, 25, 19, 19, 19, 17, 26, 18, 20, 22, 20, 22, 22, 21,\n", " 21, 21, 21, 18, 18, 19, 19, 19, 25, 17, 22, 21, 17, 18, 18, 18, 20,\n", " 22, 17, 19, 19, 20, 25, 19, 25, 18, 17, 25, 15, 19, 19, 23, 19, 22,\n", " 23, 17, 15, 22, 
16, 20, 24, 23, 19, 18, 24, 19, 20, 20, 19, 19, 18,\n", " 26, 18, 18, 23, 20, 18, 21, 22, 25, 23, 16, 19, 25, 24, 22, 21, 20,\n", " 20, 18, 25, 17, 18, 14, 17, 17, 18, 19, 17, 19, 12, 15, 24, 16, 19,\n", " 15, 18, 20, 21, 17, 19, 21, 26, 19, 22, 19, 19, 21, 29, 19, 22, 21,\n", " 22, 19, 23, 17, 25, 20, 21, 20, 23, 18, 21, 19, 27, 14, 18, 16, 19,\n", " 21, 17, 25, 17, 19, 21, 20, 23, 22, 18, 24, 20, 20, 21, 25, 15, 18,\n", " 17, 21, 27, 20, 24, 23, 21, 20, 26, 20, 23, 20, 21, 23, 21, 19, 18,\n", " 21, 19, 21, 15, 22, 22, 23, 21, 19, 20, 14, 21, 25, 18, 20, 13, 17,\n", " 26, 23, 19, 18, 14, 22, 25, 21, 17, 23, 23, 17, 20, 19, 22, 20, 21,\n", " 18, 17, 17, 19, 26, 19, 18, 21, 20, 19, 22, 15, 20, 14, 19, 22, 23,\n", " 20, 17, 23, 16, 17, 18, 18, 20, 16, 16, 14, 22, 19, 19, 22, 16, 22,\n", " 24, 18, 21, 22, 18, 22, 20, 25, 17, 16, 25, 27, 20, 22, 22, 23, 21,\n", " 21, 23, 15, 17, 22, 21, 23, 17, 16, 26, 20, 17, 22, 24, 21, 21, 24,\n", " 21, 20, 21, 21, 21, 19, 24, 18, 22, 23, 21, 15, 18, 21, 27, 14, 17,\n", " 21, 23, 26, 23, 25, 22, 20, 20, 18, 19, 20, 21, 25, 13, 21, 17, 19,\n", " 18, 21, 19, 17, 21, 19]))" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "maxlens, depths = get_quick_depths(data, sample)\n", "maxlens, depths" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "data = s3.data\n", "samples = list(s3.data.samples.values())\n", "sample = samples[1]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "sample = samples[1]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "concat_multiple_edits(data, sample)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "merge_end_to_end(data, sample, False)" ] }, { "cell_type": "code", "execution_count": 11, 
"metadata": {}, "outputs": [], "source": [ "dereplicate(data, sample, 2)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "split_endtoend_reads(data, sample)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "mapping_reads(data, sample, 2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "build_ref_cigars(data, sample)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#samtools mpileup -uf ref.fa aln1.bam aln2.bam | bcftools view -bvcg - > var.raw.bcf\n", "! samtools mpileup -ur /home/deren/Dropbox/opbox/Maud/lgeorge.genome.fa " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#regions = bedtools_merge(data, sample).strip().split(\"\\n\")\n", "fullregions = bedtools_merge(data, sample).strip().split(\"\\n\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "regions = (i.split(\"\\t\") for i in fullregions)\n", "regions = ((i, int(j), int(k)) for (i, j, k) in regions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
 "\n",
 "def get_ref_region(reference, contig, rstart, rend):\n",
 "    \"\"\"\n",
 "    Return [header, sequence] for one region of a fasta reference.\n",
 "\n",
 "    Runs `samtools faidx reference contig:START-rend` with START = rstart + 1.\n",
 "    The +1 presumably converts a 0-based region start (as in bed output)\n",
 "    to the 1-based coordinates faidx expects -- confirm against the caller.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    reference : str\n",
 "        Path to the reference fasta handed to samtools.\n",
 "    contig : str\n",
 "        Name of the contig/scaffold to read from.\n",
 "    rstart, rend : int\n",
 "        Start and end of the region; see the note on START above.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    list\n",
 "        [header, sequence]: the first line samtools printed, and all\n",
 "        remaining output with newlines removed.\n",
 "\n",
 "    Raises\n",
 "    ------\n",
 "    ValueError\n",
 "        If samtools printed no header line (e.g., the lookup failed); the\n",
 "        message includes whatever samtools wrote to stderr.\n",
 "    \"\"\"\n",
 "    cmd = [\n",
 "        ip.bins.samtools, 'faidx',\n",
 "        reference,\n",
 "        \"{}:{}-{}\".format(contig, rstart + 1, rend),\n",
 "    ]\n",
 "\n",
 "    # stderr is captured too, so a failed lookup can be reported instead of\n",
 "    # surfacing as an unexplained unpacking error further down.\n",
 "    proc = sps.Popen(cmd, stdout=sps.PIPE, stderr=sps.PIPE)\n",
 "    stdout, stderr = proc.communicate()\n",
 "\n",
 "    # first line is the fasta header, everything after it is sequence\n",
 "    name, newline, seq = stdout.decode().partition(\"\\n\")\n",
 "    if not newline:\n",
 "        raise ValueError(\n",
 "            \"samtools faidx returned no sequence for {}: {}\".format(\n",
 "                cmd[-1], stderr.decode().strip()))\n",
 "    return [name, seq.replace(\"\\n\", \"\")]\n"
 ] }, { "cell_type": "code", "execution_count": 527, "metadata": {}, "outputs": [], "source": [ "def
build_ref_cigars(data, sample):\n", " \n", " # get all regions with reads. Generator to yield (str, int, int)\n", " #fullregions = bedtools_merge(data, sample).strip().split(\"\\n\") \n", " regions = (i.split(\"\\t\") for i in fullregions)\n", " regions = ((i, int(j), int(k)) for (i, j, k) in regions)\n", "\n", " # access reads from bam file using pysam\n", " samfile = AlignmentFile(\n", " os.path.join(data.dirs.refmapping, \n", " \"{}-mapped-sorted.bam\".format(sample.name)),\n", " 'rb')\n", "\n", " # iterate over all regions\n", " out = open(\"test.clustS\", 'w')\n", " idx = 0\n", " clusters = []\n", " for reg in regions:\n", " ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n", " reads = samfile.fetch(*reg)\n", "\n", " # match paired reads together in a dictionary\n", " rdict = {}\n", " for read in reads:\n", " if read.qname not in rdict:\n", " rdict[read.qname] = [read, None]\n", " else:\n", " rdict[read.qname][1] = read\n", "\n", " # sort keys by derep number\n", " keys = sorted(\n", " rdict.keys(),\n", " key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n", "\n", " # build the cluster based on map positions, orientation, cigar\n", " clust = []\n", " for key in keys:\n", " r1, r2 = rdict[key]\n", " if r1 and r2:\n", "\n", " aref = np.array(list(ref[1]))\n", " arr1 = np.zeros(aref.size, dtype=\"U1\")\n", " arr2 = np.zeros(aref.size, dtype=\"U1\")\n", " arr1.fill(\"-\")\n", " arr2.fill(\"-\")\n", "\n", " try:\n", " # how far ahead of the start does this read begin\n", " seq = cigared(r1.seq, r1.cigar)\n", " start = r1.reference_start - reg[1] \n", " arr1[start:start + len(seq)] = list(seq)\n", "\n", " seq = cigared(r2.seq, r2.cigar)\n", " start = r2.reference_start - reg[1] \n", " arr2[start:start + len(seq)] = list(seq)\n", "\n", " arr3 = join_arrays(arr1, arr2)\n", " pairseq = \"\".join(arr3)\n", " derep = r1.qname.split(\"=\")[-1]\n", " clust.append(\"{}\\n{}\".format(\"{}:{}-{};size={}\"\n", " .format(*reg, derep), pairseq))\n", " 
except ValueError:\n", " print(reg)\n", " clusters.append(\"\\n\".join(clust))\n", " idx += 1\n", " if not idx % 1000:\n", " out.write(\"\\n//\\n//\\n\".join(clusters))\n", " out.close()\n" ] }, { "cell_type": "code", "execution_count": 549, "metadata": {}, "outputs": [], "source": [
 "\n",
 "def build_ref_cigars(data, sample):\n",
 "    \"\"\"\n",
 "    Build the gzipped cluster file for one sample from its mapped reads.\n",
 "\n",
 "    For every region returned by `bedtools_merge(data, sample)` the reads\n",
 "    overlapping it are fetched from\n",
 "    <data.dirs.refmapping>/<sample.name>-mapped-sorted.bam, the two reads\n",
 "    sharing a query name are paired, and each complete pair is laid out on\n",
 "    a '-' padded row as long as the reference region. The rows of a region\n",
 "    form one cluster. Clusters are written to\n",
 "    <data.dirs.refmapping>/<sample.name>.clustS.gz, each one followed by\n",
 "    two lines containing '//'.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    data : object\n",
 "        Must provide `dirs.refmapping` and\n",
 "        `paramsdict[\"reference_sequence\"]`.\n",
 "    sample : object\n",
 "        Must provide `name`; also passed on to `bedtools_merge`.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    None\n",
 "        The result is the cluster file written to disk.\n",
 "    \"\"\"\n",
 "    # Regions are computed for the sample passed in instead of being read\n",
 "    # from a notebook-level `fullregions`, so the output cannot silently\n",
 "    # belong to whichever sample an earlier cell happened to process.\n",
 "    # Each line is expected to hold contig, start, end separated by tabs;\n",
 "    # blank lines (e.g., empty bedtools output) are skipped.\n",
 "    merged = bedtools_merge(data, sample).strip().split(\"\\n\")\n",
 "    fields = (line.split(\"\\t\") for line in merged if line)\n",
 "    regions = ((i, int(j), int(k)) for (i, j, k) in fields)\n",
 "\n",
 "    bampath = os.path.join(\n",
 "        data.dirs.refmapping, \"{}-mapped-sorted.bam\".format(sample.name))\n",
 "    opath = os.path.join(\n",
 "        data.dirs.refmapping, \"{}.clustS.gz\".format(sample.name))\n",
 "\n",
 "    # every cluster is terminated by this separator\n",
 "    separator = \"\\n//\\n//\\n\"\n",
 "    # number of clusters buffered in memory between writes\n",
 "    chunksize = 100\n",
 "\n",
 "    # access reads from bam file using pysam; both handles are closed even\n",
 "    # if a region fails or the run is interrupted\n",
 "    samfile = AlignmentFile(bampath, 'rb')\n",
 "    try:\n",
 "        with gzip.open(opath, 'w') as out:\n",
 "            clusters = []\n",
 "            for reg in regions:\n",
 "                ref = get_ref_region(\n",
 "                    data.paramsdict[\"reference_sequence\"], *reg)\n",
 "                reflen = len(ref[1])\n",
 "\n",
 "                # match paired reads together in a dictionary\n",
 "                rdict = {}\n",
 "                for read in samfile.fetch(*reg):\n",
 "                    if read.qname not in rdict:\n",
 "                        rdict[read.qname] = [read, None]\n",
 "                    else:\n",
 "                        rdict[read.qname][1] = read\n",
 "\n",
 "                # sort keys by derep number\n",
 "                keys = sorted(\n",
 "                    rdict.keys(),\n",
 "                    key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n",
 "\n",
 "                # build the cluster based on map positions and cigar\n",
 "                clust = []\n",
 "                for key in keys:\n",
 "                    r1, r2 = rdict[key]\n",
 "                    if r1 and r2:\n",
 "                        arr1 = np.full(reflen, \"-\", dtype=\"U1\")\n",
 "                        arr2 = np.full(reflen, \"-\", dtype=\"U1\")\n",
 "\n",
 "                        # how far past the region start does each read begin\n",
 "                        seq = cigared(r1.seq, r1.cigar)\n",
 "                        start = r1.reference_start - reg[1]\n",
 "                        arr1[start:start + len(seq)] = list(seq)\n",
 "\n",
 "                        seq = cigared(r2.seq, r2.cigar)\n",
 "                        start = r2.reference_start - reg[1]\n",
 "                        arr2[start:start + len(seq)] = list(seq)\n",
 "\n",
 "                        pairseq = \"\".join(join_arrays(arr1, arr2))\n",
 "                        derep = r1.qname.split(\"=\")[-1]\n",
 "                        rname = \"{}:{}-{};size={}\".format(*reg, derep)\n",
 "                        clust.append(\"{}\\n{}\".format(rname, pairseq))\n",
 "\n",
 "                # a region without any complete pair adds no cluster\n",
 "                if clust:\n",
 "                    clusters.append(\"\\n\".join(clust))\n",
 "\n",
 "                # write a full batch and start a new one, so that no\n",
 "                # cluster is ever written twice\n",
 "                if len(clusters) >= chunksize:\n",
 "                    out.write(\n",
 "                        (separator.join(clusters) + separator).encode())\n",
 "                    clusters = []\n",
 "\n",
 "            # write the final, partial batch\n",
 "            if clusters:\n",
 "                out.write((separator.join(clusters) + separator).encode())\n",
 "    finally:\n",
 "        samfile.close()\n"
 ] }, { "cell_type": "code", "execution_count": 550, "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_ref_cigars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36mbuild_ref_cigars\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mclusters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mreg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mregions\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0mref\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_ref_region\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparamsdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"reference_sequence\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m
\u001b[0;34m*\u001b[0m\u001b[0mreg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0mreads\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msamfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mreg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mget_ref_region\u001b[0;34m(reference, contig, rstart, rend)\u001b[0m\n\u001b[1;32m 2150\u001b[0m \"\"\"\n\u001b[1;32m 2151\u001b[0m \u001b[0minp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"{}_derep.fastq\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2152\u001b[0;31m \u001b[0mout1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"{}_R1-tmp.fastq\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2153\u001b[0m \u001b[0mout2\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"{}_R2-tmp.fastq\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2154\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/lib/python3.6/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)\u001b[0m\n\u001b[1;32m 707\u001b[0m \u001b[0mc2pread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc2pwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 708\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0;31m# Cleanup if the child failed starting.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/lib/python3.6/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1296\u001b[0m \u001b[0merrpipe_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
1297\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1298\u001b[0;31m \u001b[0mpart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrpipe_read\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m50000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1299\u001b[0m \u001b[0merrpipe_data\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mpart\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1300\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpart\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrpipe_data\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m50000\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "c = build_ref_cigars(data, sample)" ] }, { "cell_type": "code", "execution_count": 547, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"b'Contig0:3481-3748;size=3\\nCATGAGGTCCTTAGGGAGGGAATCAACGAAGTAGCCAAACACTTGCTGAATGTTGAAATGAGCCCCTACAG------------------------------------------------------------------------------------------------------------------------TTCCAGGGTCACACCAGCCAAGGCACTGTTGCAGTCACTTAACAAAGCAGTGTGATTCCCAGTTGGCCACAGAATT\\n//\\n//\\nContig0:4544-5088;size=1\\nCATGCTTGTTTAAAGTTGTGCAGTGCTCTTTTCTGGCAGGGGATTGGTCCTGCTTTGAGCAGGGGGTTGGA--------------------------------------------------------------------------------------------------------------------------------------TGGACTGGGAGCCCCTCTGTCAGCTCCCTGCTCCCCTAAGTTCCCTGTGCTGCAGTCGCCCAGCAGGCTATCAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:4544-5088;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGCTGCTCCTGCCCTCTGCCTTGGAGCTGCTCCCAGAGACTCCTGCTTGCTGTGCAGGGAGGAAAGG------------------------------------------------------------------------------------GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAACTTCCTGATCCACTGACAAACAATGCAATT\\nContig0:4544-5088;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGCTGCTCCTGCCCTCTGCCTTGGAGCTGCTCCCAGAGACTCCTGCTTGCTGCGCAGGGAGGAAAGG------------------------------------------------------------------------------------GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAAC
TTCCTGATCCACTGACAAACAATGCAATT\\nContig0:4544-5088;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGTGTCCCCCTCCTCCCTGCTCCTGCACCCTGCTTACCTCTTCTCCATATAGAGCAGGGAGGGGACAC-GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAACTTCCTGATCCACTGACAAACAATGCAATT\\n//\\n//\\nContig0:7193-7952;size=1\\nAATTGATGGGACCCTGGAACATTTCTAGCTTGTACTGATTGTTTTGGGATTTTTTTGTTTGTTCTCTGGTTTCAATAGCAGTTGGGTGCCCAGATACCTGGGGGGAGCAGTTGGAGGGGGGTTCGCCTTCCTCTGCAGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:7193-7952;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------CATGGAGCACAGGACATTTGCAGGTTTAAACTAGTGTAAATGGTGAATCCTCTGTGACTTAAAGTCTTTAA----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAAGCAGCAAATAGATGGGGTGAGGGCAGGTGAATGGAGCGACTCCAATCAGATCATGGGTTTTAGCAGACTAATT---------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:7193-7952;size=1\\nn//\\n//\\nContig0:13327-13845;size=4\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCTTTCTCATTCACACACTTTTCATAGGGTGCTACAGTCATGCATCCTGATCCATACTTCTGTTACCC---------------------------------------------------------------------------------------------CTGCCCCCTCCCGTGGACGGACTCCGGGTGGGTGTATCGCAGGCTGGGAGCATACTGAAGGTGGACACATG\\nContig0:13327-13845;size=1\\nCCAGCCAGGGCTTTGTCAAGCCTGACCTTAAAAACCTCTAAGGAAGGAGATTCCACCACCTCCCTAG----------------------------------------------------------------------------------------------------------------------TGAGAACAGTCTAGATCCAACCTCTTTGGAACCCCCTTTCAGGTAGTTCAAAGTAGCTATCAAAT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\n//\\n//\\nContig0:14393-14579;size=1\\nCATGCTGGGCCAGATCCACCCCCCTGGGGAAGCCCAGGGGATTGCACTAGGGATCAGGTTTGCCCCTT---------------------------------------------TTTAAATGAGTCCTTATTTGACTCTGGGCTCACCAGGCCACCTCCAAGCTGTTCCTTGGCTGGCTCCGTAATT\\n//\\n//\\nContig0:14907-15026;size=1\\nCATGATCTTGGACACATCTGTTCACCTCTCTTCTTCCCCTCCACGCCCCACCACCCAAAAAAATCTGTGAAATGGTAGGGTTGATGCTGCTTTACCTTCTGGGGGTGCTTGTGAGAATT\\n//\\n//\\nContig0:15377-15552;size=1\\nAATTGCCCTGCTTGCTACCCCTTAACACCAGCCCTGGGTTTTACATGCAGAAAACAGGTGTTGTGGCACAGG--------------------------------TGATCTCGGCAGGCTGCTGTTGGTCCATGTGCCATGGTCCAGCAGCACTCCTGGACTTGCAAACATTCATG\\n//\\n//\\nContig0:16736-16933;size=1\\nAAATTCTCAGGGCTTTGCTATGC
AGGAGATCAGACTAGATTA------------------------------------------------------------------------------------GCAGGAAAATCTGTTACCTTTGTGTTTTCCCTGGGAGAGGTGGGGGCGCCGCTGGCTCCGACTCCGGCATG\\n//\\n//\\nContig0:19843-20061;size=1\\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGGAGCAGC-----------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCAAGGATGAATAGAGATCTGGACTGGAAGCTAAATT\\nContig0:19843-20061;size=1\\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGG-----------------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCATGGATGACTAGAGATCTGGACTGGAAGCTAAATT\\nContig0:19843-20061;size=1\\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGGAGCAGC-----------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCATGGATGACTAGAGATCTGGACTGGAAGCTAAATT\\n//\\n//\\nContig0:22414-22496;size=1\\nAATTCTCGATCATCACGTGCACTGGGTGTGGCCGGATTTCCGAGATGGATGGAACTGGGTGAGCCAAATGAATCCCTCCATG\\n//\\n//\\nContig0:22928-23117;size=1\\nCATGGGCCAGGATTCCAAAGGTTAACACCCCGGCAGCAGAGGTGGGGAGCTGAGAGCACTCAGCACCTCGA--------------------------------------------GGACTGCACTCTATATCCACCCTTGAATAGGAACCTACATGAAATGGAGTCGCTCTCCAGATGTTTGGGGAATT\\n//\\n//\\nContig0:23425-23627;size=1\\nCATGCATAAGGTCTCTAAATCCCACCACTGTTCTGATCTTTGTTTTGTAAAGTGCCCATGAGCGTAGCATC-------------------------------------------------------------------------------------GGAGAGGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATT-\\nContig0:23425-23627;size=1\\nCATGCATAAGTTCTCTAAATCCCACCACTGTTCTGATCTTTGTTTTGTAAAGTGCCCATGAGCGTAGCATC------------------------------------------------------------------------------------------GGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATTA\\nContig0:23425-23627;size=1\\n--------------------------------------------------------CATGAGCGTAGCATCAGGCCCAAATGACAGGGCTGATATGCCACACACTGGTAAACTCTGTCCCATTTCTCATTCACTCTGCATTCAGTGGTGTAATGTGGGAGAGGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATT
-\\n//\\n//\\nContig0:24391-24908;size=3\\n--------------------------AATTATTATGTACATAGTTTCGTCCTATTCAGTGTCTACTCAGCGCTTCTCGGCTTGTCTCTTGTATTCATTAAAT-------------------------------------------------------------------------------------------------------------------CGCTCACTGCTCAGCAGTTCGATGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT----------------------------------------------------------------------TGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCC--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT------------------------------------------------------------------------------------------------------------------------------------------------CGCTCACTACTCCACAGTTCAGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATCTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACGTAGTTTAGTCCTATTCAATGTCTACTCAGCGCT----------------------------------------------------------------------TGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCAATGTTGCAATGTCCTGGTGAAATCGCTCGCCATG--------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT--------------------------------------------------------------------------GGGCTCCATTTGCCCTGATAGTGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCACCGTGCTC--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT---------------------------------------------------------------------TTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTCCTGCAATGTCCTGGTGAAATCGCACGCCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\n-----------------------------------------AATTTAGTCCTATTCGGGGTCTACTCAGCGCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGCATCTCTTGTC----------------------------------------------------------------------------------------------------CGCTCACCGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\n------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGCCGTCTTTTCCTTGCCATGCAGTGCATGGTCAAATGCATCGTCTCGAAGAAGTTCACCAATCTGAGGACCAACAAAGACACCTTCCTTTGTCTTAGCTTCACTTAACCTTGGAAATT\\n//\\n//\\nContig0:25290-25837;size=1\\nAATTGATGGCAAAACATTGCCATTATGCAGCAAAACAACTTTAAGACTCGTCTTCGATGAATCAATGAACAGTCTC----------------------------------------------------TGTTGCAGGCTACTAGATCACCTTCCATGAAGAAGAATGGGACAAGATCCTTTTGATGGTCACGGAACATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:25290-25837;size=1\\nAATTGATGGCAAAACATTGCCATTATGCAGCAAAACAGCTTTAAGACTTGTCTTCGATGAATCAATGAACAGTCTC----------------------------------------------------TATTGCAGGCTCCAAGATCACCTTCCATGAAGAAGAATGGGACAAGATCCTTTTGATGGTCACGGAACATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:25290-25837;size=1\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTAATAAAGTTTGACTACATTTATTTCAGAAGCATTTTGGCTGTAGAGCAGCGAAATCAAAGCTGTGCCGATGC-----------
-------------------------------------------------------------------------------AAGCAAATAGAGACATTTTCAGGTCTGCTGGCTGGTTAGGCTCCATAGTCCTACGCAAGGATGTGATCATG\\n//\\n//\\nContig0:28260-28458;size=1\\nCATGCGTCTGCAGAAAAGTACGGAGACTGGGCAAAGGCTGTGAGAAAGAAGAACGTTTGGGGCTTGTCCAA---------------------------------------------------ATCCAGTTCCTCTGCCTACTCTGGGACATTTATATCACAGAGTAGGCACTGTAATAAAAAGCAAGCAATGAGAATT\\n//\\n//\\nContig0:28945-29088;size=1\\nAATTCAAGAACTCAAAGGGACACTATCACCTTAAACTAGCAGGCACACACAAGAGGCAGTAGCTAGGTTGATGGGAGGATGCTCAGAGGTGTGGATTCTTTCATCGATCTAGCAGTGCCCACACTAGGGCTTAGGGTGGCATG\\n//\\n//\\nContig0:30099-30713;size=7\\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCATTTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------TGGTAGGGGCTGAAGGCTCTGGTCTGGGGTGGGCTTGATCTCTTGTACAGAAAACAGAGCTGAACCACATG\\nContig0:30099-30713;size=1\\nAATTAATGCTAGTCTTAATGAGAGTCTATTTCAACTCTTATGTAGCTGACAGGCTTTCTCAGGGCATGGCTGCTGT-------------------------------------------------------------------------------------------------TAGTTCCTTGAAGAATCAGGTTTTCCCTATTTTTTTTTAATCTCCTGTGTCAACTGGCGTCTTGGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:30099-30713;size=1\\nAATTAATGATAGTCTTAATGAGAGTCTATTTCAACTCTTATGTAGCTGACAGGCTTTCTCAGGGCATGGCTGCTGT---------------
---------------------------------------------------------------------------------TTAGTTCCTTGAAGAATCAGGTTTTCCCTATTTTTTTTTAATCTCCTGTGTCAACTGGCGTCTTGGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:30099-30713;size=1\\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCAATTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------TGGTAGGGGCTGAAGGCTCTGGTCTGGGGTGGGCTTGATCTCTTGTACAGAAAACAGAGCTGAACCACATG\\nContig0:30099-30713;size=1\\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCATTTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------------------------------TGGGGTGGGCTTGACCTCTTGTACAGAAAACAGAGCTGA--------\\n//\\n//\\nContig0:32872-33505;size=1\\nAATTCCTCTCCCTCCCTAACGTTAATCCCCCTGATATATTTATATAGAGCAAGCA---------------------------------------------------------------------------------CTCGGAT
CATCCTAGTAGCCCGTCTCTGAACCTGTTCCAGTTTGAATTC----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:32872-33505;size=1\\nn//\\n//\\nContig0:33785-34014;size=3\\nAATTAGTGCTAGGAGTCATTTTATTTAGCACCGTCATTAATGATTTAGGAAGGTGGAGCGTTAGGAGCCCATTAAT----------------------------------------------------------------------------------ATAGGCCTTCCCAAGCTTTGGCAACCACCTATTCCCATAGGTGCTACAGGGGGCAGAGCTCCGTGTGCATG\\nContig0:33785-34014;size=1\\nAATTAGTGCTAGGAGTCATTTTCTTTAGCAGCGTCATTAATGATTTAGGAAGGTG------------------------------------------------------------------------------------------------------------CCTTCCCAAGCTTTGGCAACCACCTATTCCCATAGGTGCTACAGGGGGCAGAGCTCCGTGTGCATG\\n//\\n//\\nContig0:36273-36404;size=1\\nAATTGCCCCACTTGCCTCCTTGCCGGGAGGCCCTGAAACACCCCCAGGGAAAAATAAAACTCAGCGCCTGTGATATGCAGTCTTAGATATAGAGGCAGGCTGAGGAGGGGCTGGGTGTGAGGGGGAGCATG\\n//\\n//\\nContig0:37976-38353;size=5\\nAATTAAATGCTACAGTGTTGAGCTGTTTTTTGAGGAAGAGCCTGTTCCTTGAAAGCAACTGGCCTTTGCTTTTCTT----------------------------------------------------------------------------------------------------------------------------GAGGCTGTGGGATCGGCAGCTGGAGTTCCCAGGGCTGCTTGCTGAACTTGTTTATTGTCGATTTTCCCATG----------------------------------------------------------------------------------------------------------\\nContig0:37976-38353;size=1\\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTTAAGTATCCTCACACCTCTTGTCTACTGTCTGAAATGGGCCGTCTTGNTTAT
CACTACAAAAGTTTTTTTTTTCTCCTGCTGATAATAGCTCATCTTAATTAATT\\n//\\n//\\nContig0:39644-39906;size=1\\nAATTTGGACATCACGATTCTCAGGCGGCAGGTTCATGCCATGGAACACCGATTCTGGGCCTTGGAAAC---------------------------------------------------------------------------------------------------------------------------GTGCAAGAATACCAAGATGAGAGCAGCCCTCACAGTTGAGAAGCAAGTGACGATAGCCCTGTGGAGGCATG\\n//\\n//\\nContig0:40396-40633;size=11\\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATGGTGGTA------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\\nContig0:40396-40633;size=1\\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGGCTGAGCAAGTGCAGAATGGTGGTA------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\\nContig0:40396-40633;size=1\\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATGGTG------------------------------------------------------------------------------------------------------------------------GAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\\nContig0:40396-40633;size=1\\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATG------------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\\n//\\n//\\nContig0:41620-42163;size=14\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG------------------------------------------------------------------------------------------------CATCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTG
CCAATGCATCCCAAGTCTTCAGGGCAAATT\\nContig0:41620-42163;size=1\\nCATGGCATGTGTGCTTTCTTTACAAGATCGCATTTTGCCTCTTATATTGAGGGCCTGCTGGTTTGGCGTGAGAGATCACACACGCAGGGCTGGTGGGCAACAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:41620-42163;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG--------------------------------------------------------------------------------------------------TCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\\nContig0:41620-42163;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTC----------------------------------------------------------------------------------------------------TCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\\nContig0:41620-42163;size=1\\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG------------------------------------------------------------------------------------------------CATCCCCATACACATTAATGGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\\n//\\n//\\nContig0:42764-42845;size=1\\nCATGGTCACCTGTGCTGATGAGCTCTGCATGGTCACCTGTGCTGATCAGCTTGCCACGCTGGTCAAACAGGAAATCAAATT\\n//\\n//\\nContig0:44568-44784;size=1\\nAATTTCCACAGGGCGGGAAAACCATTTTGTGCCCAGCACCCATTCAGGCAGCTGGTCACTGGGGTTGTCTGTGATG---------------------------------------------------------------------CCCTTTGATGACCATCCCCGCTGCTTAAGACCCAAAAGCAGAGCCTCGACCGCTGATGTTTCATGCACATG\\n//\\n//\\nContig0:48224-48506;size=2\\n-------------------------------------------------------------------CATGTCCAAATAATGTTGGACAACATAAGAAGGGCATTTTATATAAACAAGCAGGGGGGAGTGAGATCTCT--------------------------------------------------------------------ATATTTACCAGGGTTGCAAAACTCCCTAGCAGAGAGCCTGAGTGGATGTTTTTCCACAAATAGGAGATTCACAATT\\nContig0:48224-48506;size=1\\nTAATTTACTAGAACTCCAGGTGGTTCACTGAGCCTGCATCTCCTTCCCCCTGCTCATCCAATCCAGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\n//\\n//\\nContig0:49781-50026;size=1\\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTTCCTCTTATGTGGAAGGGGAAGTAGTGGTCTTGA--------------------------------------------------------------------------------------------------GGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATGCACTTTGCAGCAGCACCCATG\\nContig0:49781-50026;size=1\\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTT--------------------------------------------------------------------------------------------------------------CGGCTTCGTGTTGAAGCACTCGGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATG---------------------\\nContig0:49781-50026;size=1\\nAATTTGCTTTTGCGGGCCAGTCCCATCTAAGGGCTACTCAGTTCCTCTTATGTGG
AAGGGGAAGTA------------------------------------------------------------------------------------------------------------------CGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATGCACTTTGCAGCAGCACCCATG\\nContig0:49781-50026;size=1\\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTTCCTCTTATGTGGAAGGGGAAGTAGTGGTCTTGA-----------------------------------------------------------------------------CGGCTTCGTGTTGAAGCACTCGGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATG---------------------\\n//\\n//\\nContig0:51214-51341;size=1\\nCATGCCAGGTCCCACTGGCGGAGGCAGGGTCACGCCCAGGCCATCCCTGGAAGTACCAGAATGTCCAGTATTCTGGGGATGCATTAGGGGTCACTTTACTCAAAAGTGGCAAAGCCTCTAGCGAATT\\n//\\n//\\nContig0:53474-53686;size=1\\nCATGTTAACCAGGCCGGGTGCCCCAGGCCTCAGGCATCCTGCCTCTTAACCAGGCCCAGATGCTCTTCTTT--------------------------------------------------------------------------------------------------AACTCGCTGTGTGACTTTGATCAAGTCACTTTGCCTTTCCATG\\nContig0:53474-53686;size=1\\nCATGTTAACCAGGCCGGGTGCCCCAGGCCTCAGGCATCCTGC---------------------------------------------------------------------------------------------------GAACATTTGGTTTCTCTTCTGCGCCAATAACTCGCTGTGTGACTTTGATCAAGTCACTTTGCCTTTCCATG\\n//\\n//\\nContig0:53920-54018;size=1\\nCATGATATGCAGTGGAGGGGAGAGGAACCCCTGGGCCTGGCAGTTCAGAGCCCTGGCACCTCTGGGCTTGCTGCAGCAGTTACAAATGTAAAAAAATT\\n//\\n//\\nContig0:56088-56333;size=10\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG-------------------------------------------------------------------------------------------------TAATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\nTCATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCA-------------------------------------------------------------------------------------------------------------------------TAATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG---------------------------------------------------------
-------------------------------------------TAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG--------------------------------------------------------------------------------------------------AATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGACTTAAATCAAGAGATAGCCGAGATTGGAATCTG-------------------------------------------------------------------------------------------------TAATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG--------------------------------------------------------------------------------------------------------------AGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\n//\\n//\\nContig0:59313-59545;size=1\\nCATGGAGGGTGGGGGCTTCTGGTTGTAACTGGGGATGATCCCTCTGACCAGTTCCATGCCACTGTGG-----------------------------------------------------------------------------------------GCTTCCCCTGGTGTCAGGGCAAGCCCCCGAGAGGAACAATGACAAAGCATCACATACCGAGGTGGAATATTTAATT\\n//\\n//\\nContig0:62283-62553;size=3\\n-AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGT----------------------------------------------------------------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\\nContig0:62283-62553;size=1\\nAAATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAA---------------------------------------------------------------------------------------------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\\nContig0:62283-62553;size=1\\nAAATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAAC------------------------------------------------------------------------------------
--------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\\nContig0:62283-62553;size=1\\n-AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGT--------------------------------------------------------------------------------------------------------------------------GAGGACAGTGCTGACCATGCACATACTAACTAGTATGTTATAATGCCCTTATTTTGGGCTTTTCCTCCATG\\n//\\n//\\nContig0:63207-63392;size=1\\nCATGTTCAATGGTTTCACTTCAACGCAAACAGCCGTGGCTTCGATTGTGAACAGAAAATGTTGTGGCACAG--------------------------------------TGCCTCGCTGGTCTCTGAGGGCCCTGAAGAGCTTGCACGCCAAAGGACCAGATGGCTGGTGGGTTAAATGGAAATT\\nContig0:63207-63392;size=1\\nCATGTTCAATGGTTTCACTTCAACGCAAACAGCCGCGGCTTCGATTGTGAACAGAAAATGTTGTGGCACAG--------------------------------------TGCCTCGCTGGTCTCTGAGGGCCCTGAAGAGCTTGCACGCCAAAGGACCAGATGGCTGGTGGGTTAAATGGAAATT\\n//\\n//\\nContig0:64734-65000;size=12\\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGGAGAG-----------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\\nContig0:64734-65000;size=1\\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGG---------------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\\nContig0:64734-65000;size=1\\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGGAGAG-----------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATGCCGCTCAGAGCCAATGGGGTGGCATTCATG\\nContig0:64734-65000;size=1\\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGGAGCCACTGCGAATGCACAAGAGAGCCCAATGT--------------------------------------------------------------------------------------------------------------------
---------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\\n//\\n//\\nContig0:65302-65475;size=1\\nAATTAAAAACGAACCATAAACAACAACTGAGAAAGGATTCCAGATCGTCTTTTAAAAAAGGAAGCGGGTTCTCCGG--------------------------GAAAAATGCACAGTTGATAGGAAAGAGTGTTAGCTCCATCGTGAGCCCCTGACCCACAAGGGCTGTGCATG\\n//\\n//\\nContig0:67691-68021;size=1\\nCATGATCTTATGGTCACTCTACCATTCTCCCATGCACCTGGAAGAGAAAGACAGAAGCTGTGTTAACCGAC-------------------------------------------------------------------------------------------------TTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCACCTCCCCTCAATAATGATACTTAGCACTTGATGAATT--------------------------------------------------------------------------------------\\nContig0:67691-68021;size=1\\n------------------------------CATGCACCTGGAAGAGAAAGACAGAAGCTGTGTTAACCGACTGAACCCATGAGATGCTAACGAGGTCTCTC-------------------------------------------------------------------TTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCACCTCCCCTCAATAATGATACTTAGCACTTGATGAATT--------------------------------------------------------------------------------------\\nContig0:67691-68021;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTAAATAAGCCTTACCTGCTGTGAAATGTCAATACTATTTTGAAGATGGGGAAACCGAGGCATG\\n//\\n//\\nContig0:68467-69007;size=9\\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA------------------------------------------------------------------------------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-6900
7;size=1\\nAATTTTACCTGAAGTTCATTTCAGAAGGGGAAGCTAAACTGGGGCAAGTGGGTTTAAACTCATGTGACGGTTCCCACACACAAAGTTGCACCAGGGTACCCATG----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA-------------------------------------------------------------------------------------------------CCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTG-----------------------------------------------------------------------------------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA-------------------------------------------------------------------
-----------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTTCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\nn//\\n//\\nContig0:69863-70116;size=19\\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGTCGGTTGTCTCATAGAATCTCAGGGTTGGAAGG----------------------------------------------------------------------------------------------------------TGGGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\\nContig0:69863-70116;size=1\\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGTTGGTTGTCTCATAGAATCTCAGGGTTGGAAGG------------------------------------------------------------------------------------------------------------GGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\\nContig0:69863-70116;size=1\\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGCCGGTTGTCTCATAGAATCTCA-----------------------------------------------------------------------------------------------------------------------GGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\\n//\\n//\\nContig0:71142-71672;size=1\\nAATTAGGCTTGAATAAAAACTGGGAGTGGATGGGCCATTACACAAAGTAAAACTATTTCCCCATGTTTATTTTCCC-----------------------------------------------------------------------------CCTGCTGGTAATAGCTCACCTTAACTGATCACTCTCGTTAGAGTGTGGATGGTAACACCCATTGTTTCATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:71142-71672;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------AATTTGTTAGTCTCTAAGGTGCCACAAGGACTCCTGTTCTTTTTGAGGATACAGACTAGCATGGCTGCTACTCTGA------------------------------------------------------GTGCTGAGAGTTCGATGACTCTGGTAGCTGGGGGGATGTTCACTGGGACCCCCCCGCTCCCAGGAGACATG\\n//\\n//\\nContig0:73757-73975;size=1\\nCATGGTCTGTTTGCTACCTTGGGGCTGGCCTCGCTGGATGGT---------------------------------------------------------------------------------------------------------GGTGCTCCAGAGACTGGTGCTGAGGATGCAGCAGGGGGTGCTCTCCCTTCTGCATCAGAACGTATCCCATG\\n//\\n//\\nContig0:74377-74795;size=1\\nAATTGGTATAAGAATCCAAGGATGGTAACAAGAGCCGAAAATGCCACCCAAGGACTTCGATTCACTGACCTTGTGG-------------GCTGGAAAATGGCAGACACACTTGTTCTTTCTTGTTAGCCTCTGGAGACGCTGATGGGGCCTGAGGACATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:74377-74795;size=1\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCAGGTCAAGGAGTTAGCACCGGACTGGCAGCCCATGACTGTCCCCTGATACCATCACAGGGCTGG----------------------------------------------------------------------------------------------------------------------TGGGCACCCACACAGATCTGCTGATGCCCGTCACTCCTCACCGACGGCCACCTCGAGGGCAGATTAACATG\\n//\\n//\\nContig0:75032-75494;size=2\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG---------------------------------------------------------------------------------AAAGCCCCTTGCTCCTGCTACCCAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1
\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG--------------------------------------------------------------------------------------------------------------------CCCCTGCCACATTCAGAATCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG-------------------------------------------------------------------------------------CCCCTTGCTCCTGCTACCCAACCGCCTGCCCCCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG--------------------------------------------------------------------------------------------------------------------CCCCTGCCCCATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG-------------------------------------------------------------------------------------------------------------CCTGCCTCCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTG
TGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG--------------------------------------------------------------------------------------CCCTTGCTCCTGCTACACAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG---------------------------------------------------------------------------------AAAGCCCCTTGCTCCTGCTCCCCAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGAGATTGAGGGTTTTGAATATACAACTTCAGACTGTTAAGTGGGGTTATTGTAGCTTCCCTGCTCAATAGCTTCATACAGAATT\\n//\\n//\\nContig0:76480-77015;size=6\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTAACCAGTT------------------------------------------------------------------------------------ATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCT
GAAATT\\nContig0:76480-77015;size=4\\nCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGC------------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:76480-77015;size=2\\nCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATC-GCCATCTATCTACAATATGTATCTAAGCA-----------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:76480-77015;size=1\\nCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATC-GCCATCTATCTACAATATGTATCTAAGCA-----------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCGGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:76480-77015;size=1\\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------CATGAGAATCTCTATAAGCACTATAGGGCTCATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGG------------------------------------------------------------------------------------------------------------------ATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\\nContig0:76480-77015;size=1\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTA-------------------------------------------------------------------------------------------------AGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\\n//\\n//\\nContig0:79124-79379;size=1\\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCG-------------------------------------------------------------------------------------------------------------GGTGCTGGGCAAATGGTCGAAGACTTGTTCCAACCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\\nContig0:79124-79379;size=1\\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCGTACCTCGA--------------------------------------------------------------------------------------------------------------TGCTGGGCAAATGGTCGAAGACTGGTTCCAACCTCTCCACGCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\\nContig0:79124-79379;size=1\\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCGA---------------------------------------------------------------------------------------------------------------------------------------------CCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\\n//\\n//\\nContig0:79959-80151;size=1\\nAATTTTTAAGCAACAGGATTTACTACCCGCTGTGGGAGGTGACAGGCTGATGGCTGTCTGGGAGACCCTCTCTGTC---------------------------------------------GTAGCCTGAACAGGCAGGATCCCACTAGTTGGCCATCTCCTCCATCCTCCCGCTAGCCACAGGTAGACATG\\nContig0:79959-80151;size=1\\nAATTTTTAAGCAACAGGATTTA
CTACCCGCTGTGGGAGGTGACAGGCTGATGGCTGTCTGGGAGACCCTCTCTGTC---------------------------------------------GTAGGCTGAACAGGCAGGATCCCACTAGTTGGCCATCTCCTCCATCCTCCCGCTAGCCACAGGTAGACATG\\n//\\n//\\nContig0:81339-81852;size=11\\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGAC--------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=6\\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGAC--------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGCTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=4\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCC-----------------------------------------------------------------------------------------------------------------------------GCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\\nContig0:81339-81852;size=1\\nCATGCGTGGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGG-----------------------------------------------------------------------------------
-------------------TATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=1\\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGAT--------------------------------------------------------------------------------------------------------------------------TAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=1\\n-ATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGG----------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTT-------------------------------------------------------------------------------------------------------------------------------------GCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\\n/
/\\n//\\nContig0:82206-82699;size=4\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGA----------------------------------------------------------------------------------------GGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\nContig0:82206-82699;size=1\\nCATGCAATGCAACAAGGTCTGGAGCTCCTTAGCGAGCCTTCGAGCCACCCAGTCCCTGAAATACACCCCCTGGTCCCTTTCAGGCTGGATCCAGAATGGAAAGTGTAACACCAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:82206-82699;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTATCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGA----------------------------------------------------------------------------------------GGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\nContig0:82206-82699;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGG-------------------------------------------------
----------------------------------------GGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\nContig0:82206-82699;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCCGCTCCTTTCCTCGCTTGATTTCTTCTTGAGGGAGGCAG-------------------------------------------------------------------------------------------TGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\nContig0:82206-82699;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATATCTTCTTGAG---------------------------------------------------------------------------------------------------TGGCTCCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\n//\\n//\\nContig0:83146-83367;size=1\\nAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCAGTTTTGACGAACTGGTATTTGTTGCTG-----------------------------------------------------------------------------CCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG\\nContig0:83146-83367;size=1\\nAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCAGTTTTGACGAACTGGTATTTGTTGCTG--------------------------------------------------------------------------CACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG\\n//\\n//\\nContig0:86329-86448;size=1\\nAATTTGGGGGCACTGCTTTTTGGTGCCCCCAAATCTCGGTGCCACCGCCTAGTTCACCTAGTGGTTACACCGGCCCTGAGCGTCTGGGGAGTCTTTCCTCCCGAAGGCCCAGCAGCATG\\n//\\n//\\nContig0:95645-95887;size=10\\nAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTT------------------------------------------------
-----------------------------------------------AAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG\\nContig0:95645-95887;size=4\\nAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTAGGTGTAAAGGGCATCCTT-----------------------------------------------------------------------------------------------AAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG\\n//\\n//\\nContig0:98705-98903;size=1\\nCATGCAGTGTAGCCGCTGTTTGTTGGCAGTGCGACAAAAAACTTCCACCATCAATGAGTGGCGTTTACATT---------------------------------------------------TCGCGGTAAAACTTTTGTCTTTCGGGGTGGGGTGGGGATGTTTGGGTTAACACCACAAAAGTTTTGTCATTCAATT\\n//\\n//\\nContig0:99204-99415;size=1\\nCATGTGAGGGCAGGAGAAATGCCAAATCCAATCACTGCCTAGGGACAACTCTGACCTTTACCCCGAACAGC--------------------------------------------------------------------------------------------CCAGGTTTGATCCCTCACTGGTGAGGAGTCCTGCAGAGGTAGCACATG\\nContig0:99204-99415;size=1\\nCATGTGAGGGCAGGAGAAATGCCAAATCCAATCACTGCCTAGGGACAACTCTGACCTTTACCCCGAACAGC----------------------------------------------------------------------------------------------------------TCACTGGTGAGGAGTCCTGCAGAGGTAGCACATG\\n//\\n//\\nContig0:101553-101788;size=4\\n----CATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGC---------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\\nContig0:101553-101788;size=3\\nCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGG----------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\\nContig0:101553-101788;size=1\\nCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAG-----------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\\n//\\n//\\nContig0:102134-102336;size=1\\nAATTTGGCCTG
TTCTCCTTGCACTAGAAGTTGATATTTCCCTGCACATACGCTGCTCTTGCACGGACTCTTCTGTT-------------------------------------------------------CTCTCCTCCTTGCTCTAACACTCAAGTAAAAAATCGCCATCCCACATTATACTCCCCAATCATCCCGCATG\\n//\\n//\\nContig0:103469-103709;size=16\\nCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGC---------------------------------------------------------------------------------------------CCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\\nContig0:103469-103709;size=1\\nCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGC---------------------------------------------------------------------------------------------CCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCTGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\\n//\\n//\\nContig0:104258-104379;size=1\\nAATTTTGAAAGATGTTACTATTTATTTAGTTTTTGGCTCGTTGTCGTTCCTGATTGACTTTGTTTTGCTGGAGCAGTTGGGAGGGATTATTTCACTCTTCTTGGCAGCTTCCACATACATG\\n//\\n//\\nContig0:104739-105191;size=1\\nAATTACCAAGGGTTCCAGGTGGGCTTGTACCACTCGTGCTTCAGCCCGTGCTGTTATAGCTTCGCTGCTTCTGTAC----------------------------------------------------------------------------------------------------------CTTAATGTGCTAGCGAAAGGGTTGATGAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:104739-105191;size=1\\nAATTACCAAGGGTTCCAGGTGGGCTTGTACCACTCGTGCTTCAGCCCGTGCTGTTATAGCTTCGCTGCTTCTGTAC----------------------------------------------------------------------CGTCAACAAACCTTTAGTCTCAATCAGCGACCTCCACTTAATGTGCTAGCGAAAGGGTTGATGAATTCATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:104739-105191;size=1\\n---------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------AATTCATGTAACGATTAAAAAACGACAACAACAATAAACAGCACAAAAGTATATTACTTTAGTGTCCACCACAGCA------------------------------------------------------------------------------------------------AAAAAAGATACCCGGACGCCAGCAGTTTTCACCCTGTAACGGGAGAAAAGCCACTCTATGAAGTCCACATG\\n//\\n//\\nContig0:106701-106806;size=1\\nCATGGCCTTGGTGAGAGAGCTCTGTTCCAGAATCTGAGAGTCTCTTGTCACCACACACTTCTTCAGCTCCAGTATTTAGAGCCCCGGGCTGGGATTCACAAAATT\\n//\\n//\\nContig0:107637-108115;size=13\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAA------------------------------------------------------------------------------------------------AGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT\\nContig0:107637-108115;size=1\\nCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATC----------------------------------------------------------------------------------GGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATT---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:107637-108115;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAA---------------------------------------------------------------------------------------------
-----------AGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGTCAGACTGAAAAATT\\n//\\n//\\nContig0:110181-110429;size=3\\nAATTGCATTTGTAAACCATTTCACGACTTTGATGCACGATTTGCATGCATGATTTATCCCGGTCATATGAGATGAT-----------------------------------------------------------------------------------------------------ACAACAAAAAACAGCACCCCGAGCTCAGTGCCCCCCAACCTGGCAGCCCAGGTGGTTGCCTGGGTCACATG\\n//\\n//\\nContig0:111104-111195;size=1\\nCATGCAAGCAGACCAGAGCTGCCCCAAAGCACTGACGGTGGCCCCTTGACCTGTTTGAAATGCTACATTTGCCCCAAGCTGCAAGTGAATT\\n//\\n//\\nContig0:111481-111551;size=1\\nAATTAAAAATAGGCTGCGGCAAGAATGGATCCTCTGAACAGCGAAGGGAGGTACGTGGCAGGCTCCATGC\\n//\\n//\\nContig0:112880-113019;size=1\\nCATGCTGCTGCTTCCAGGAGCTGCCTGAGGTAAGCGCTGCCCAGAGCCTGCAACTCGGCTATCTCAAAAACTGACTGCGACAGTGAAAAATGAACAGTTCAAGGCTGAGAGCATCTGGTGATAAACTAACAAAAAAATT\\n//\\n//\\nContig0:113494-113752;size=1\\nAATTTCCATACCCAGATGTGTCCACGCCTGGACTCTACAGCATGCCGCAATCACATCTTGATAGCTGTGTCTACAC---------------------------------------------------------------------------------------------------------------ACAGGATTGTACTTGGGTGCTCAGCAACCCCAACCAGGGGCGGCTCTAGACATTTTGCCACCCCAAGCATG\\n//\\n//\\nContig0:120824-121022;size=1\\nAATTAGAGCAACCTCGAGGCTGCTCTACCTTACAGTAGGAGCAGGATAGACTCCCGGATAGTTCCAGAATATGATC---------------------------------------------------CACCAAAAGCTGCTGGGCTGGGACAGAGAACTGAGGTCTGTTTCTCTCAAGAAAGAACCAGGTTTCACATG\\nContig0:120824-121022;size=1\\nAATTAGAGCAACCTCGAGGCTGCTCTACCTTACAGTAGGAGCAGGATAGACTCCCGGATAGTTTCAGAATATGATC---------------------------------------------------CACCAAAAGCTGCTGGGCTGGGACAGAGAACTGAGGTCTGTTTCTCTCAAGAAAGAACCAGGTTTCACATG\\n//\\n//\\nContig0:122245-122688;size=1\\nAATTCCTCTAAAGCCGGTGTTCTGGGGGACGGACTGACTCAGGGGTACGGGTAAGGGAATAGATCACAGATCACTT-------GGTCCAGGATTGTACTTCCCAAATATGACGACTATATTGTACCCTTGCCTGTGCAAAAGGAGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------\\nContig0:122245-122688;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCATCGACTTCAAGAAACGGTCCCTCCGGGGCAAGGCAGAGGCAGGCT------------------------------------------------------------------AGTAGGATGGCCAAAGGGTTATACCCTTCAGATCTCCTTTACAGGTGCCATCTTCCGAGTCCGACTGCATG\\nContig0:122245-122688;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCATCGACTTCAAGAAACGGTCCCTCCGGGGCAAGGCAGAGGCAGGCTTGTAGGGTATGGGATGGGGGAAA-------------------------------------------AGTAGGATGGCCAAAGGGTTATACCCTTCAGATCTCCTTTACAGGTGCCATCTTCCGAGTCCGACTGCATG\\n//\\n//\\nContig0:124322-124627;size=1\\nAATTACCCACCTTTCAGAAGGCTGATTAATGACATGGCTCTGCAGCATGCACAGTAGCTCAACCTCCTGCATGAAA-----------------------------------------------------------------------CAAGCTATTCTTTCCCAGATCTGGCGTAGGAAGTGCGGCGCTTCCAGCGGCCAGGGAAGATCCCTGCCATG---------------------------------------------------------------------------------------\\nContig0:124322-124627;size=1\\n---------------------------------------------CATGCACAGTAGCTCAACCTCCTGCATGAAACACACTCATCTCTCTCTCTCCTGAGATTGACAGGTCACGG-----------------------------------------------------------------------------------------------------------------GGCTTTGTAGCAAACTCAGCAGAAGCGGCTTCTCACAGAGTCAATCCTGGATACAGAAGATTTGCCATAGCAAATT\\n//\\n//\\nContig0:127068-127278;size=1\\nCATGATCCTGGGAGCAGCCAGTGAAGAAAGCGAAATGCCATTATAGTGGG-----------------------------------------------------------------------------------------AGGTGGGGCGAGCTGAGGACATCTTCCCAGACCATCTCTGAGCAGCACAGAGGGTGGATCCCAGCTCCATG\\n//\\n//\\nContig0:128188-128337;siz
e=1\\nCATGGCTTCCTCTTTTATAAGGCTCAGGTGCCTTCCCTTAAGCCCAGTCGGGCAGCAGCTAATCAGACACA--TGCCCTGGACGCCCCCCCTCCCTTCTGCTTCCTCCCTTAAAGGGGCCAGTCACCCTGTGACAGCAACATTTCAATT\\n//\\n//\\nContig0:129266-129465;size=1\\nAATTTACAAAATATGAAAGGAAACACTTCTTCACATAGCCTGTGGTACGCACTGCCCAGAGGGGCCAGAGAGTCCA------------------------------------------------------------------------------------CCTCAAGCTTCAGGACATCAACTGGTCAGGAAGGAATTT\\n//\\n//\\nContig0:132993-133221;size=2\\nAATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGAT---------------------------------------------------------------------------------ATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG\\nContig0:132993-133221;size=1\\n----AATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTC-----------------------------------------------------------------------------ATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG\\n//\\n//\\nContig0:135395-135645;size=11\\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATTCTAA-------------------------------------------------------------------------------------------------------CCGGAGGTGGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATTATACTCCCAGAATCATG\\nContig0:135395-135645;size=1\\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATT-----------------------------------------------------------------------------------------------------------CCGGAGGTGGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATTATACTCCCAGAATCATG\\nContig0:135395-135645;size=1\\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATTCTAA---------------------------------------------------------------------------------------------------------------------------------CGTAATCAGCGCTGCGGCAAGCTGCATTATCCTCCCAGAATCATG\\nContig0:135395-135645;size=1\\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGG-------------------------------------------------------------------------------------------------------
----------GGAGGTCGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATTATACTCCCAGAATCATG\\n//\\n//\\nContig0:137333-137570;size=2\\nCATGGTGTTCTATCTGCTGAGCGTGTTCTGAGGCTTTTCAACCTAAAAAGTAAGTAAAACTGGAAATG---------------------------------------------------------------------------------------------ACCTGCGTATTTTACCCTTCTCTCCCCAAAATCACGTCTGAACAAAACCCATGGAATGCTCAATACATAGTCAATT\\n//\\n//\\nContig0:137831-137897;size=1\\nCATGTTTCCTAGCAACCTTAAACTTCAAGTTGACTAAGTTTGTTGTCTGGAAACCTACTATAAATT\\n//\\n//\\nContig0:139289-139543;size=1\\nAATTGAACCTGGCTTGACTGAGTTCTAGGCTCATGCCCCAGCCTCTGCACTGTGCTCCTTCTTACCAAGAGCCATC-----------------------------------------------------------------------------------------------------------TACGGTCATGTCGTACGTAGGGCTCTGTCCTGCTCCCACTGCAGCCAGTGGCAAACTCCCGTCACTTCATG\\n//\\n//\\nContig0:140146-140422;size=1\\nAATTCATTTGAATATAAATACTGTACTTACAAGTCAGTGTGTAGAGCAGTATAAACAGCTCATTGTCTGTATGAAG---------------------------------------------------------------------------------------------------------------------------------TGAAGAACCACTGCTTTAAAGGGCAGTGTGGGAGGTTGGAAAGTTGTATGATTGTTAGGAGGGACAGAGCA\\n//\\n//\\nContig0:141889-142575;size=3\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\nCATGCTCCATTCAGGTGGNCCGCGGACAGTTCCCTCTAAGACACGCGCCTGTGTGGCTGCACACGAGAGAATAAAGGACCACCCACTTAATT--------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCA---------------------------------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAAGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-1425
75;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAACGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\nn//\\n//\\nContig0:143165-143277;size=1\\nCATGGCTGGTGGAGGATGCTCAGTAACACCTAATACCTCTGCTGGCAGATATTGTTAACCCCATCTTTAGAGAGGGAAATCTGCCCTAAAGCCTGCCACATTTTGACCAATT\\n//\\n//\\nContig0:143580-143810;size=8\\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGTCATACTTTAGACTTTG-----------------------------------------------------------------------------------CTTCCAACTCTTACCTGCTGCTAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\\nContig0:143580-143810;size=1\\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAA-------------------------------------------------------------------------------------------------------------------------GCTAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\\nContig0:143580-143810;size=1\\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGTCATACTTTAGACTTTG------------------------------------------------------------------------------------------------CCTGCTGCTAATCTTTGTTCAAACGTTGATGTCTCCAGGACGTGCCTTGGCCAACATG\\nContig0:143580-143810;size=1\\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGCCATACTTTAGACTTT------------------------------------------------------------------------------------CTTCCAACTCTTACCTGCTGCTAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\\n//\\n//\\nContig0:145164-145418;size=5\\nAATT
TAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG-----------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\nContig0:145164-145418;size=1\\nAATTTAACGTCCATTAGACATTCTGAAGGGGAAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGAT------------------------------------------------------------------------------------------------------------------------GACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\nContig0:145164-145418;size=1\\nAATTTAACGTCAATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGCGCTGGT---------------------------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCATCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\nContig0:145164-145418;size=1\\nAATTTAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG----------------------------------------------------------------------------------------------------------------GACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\nContig0:145164-145418;size=1\\nAATTTAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG-----------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCCTCCGGGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\n//\\n//\\nContig0:145642-145890;size=1\\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC-----------------------------------------------------------------------------------------------------TGTCAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\\nContig0:145642-145890;size=1\\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACACAACGGTACGC-----------------------------------------------------------------------------------------------------TGTCAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\\nContig0:145642-145890
;size=1\\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC--------------------------------------------------------------------------------------------------------CAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\\nContig0:145642-145890;size=1\\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC-------------------------------------------------------------------------------------------------AAGGTGTCAAATAGCAACAGGGCTGACCCAAAACCCATG-----------------------------------------\\n//\\n//\\nContig0:146117-146349;size=1\\nAATTAAACACGGGGTAAGCCACTGCCATGCTCTGACCCAAAGGTGGCTGGTGTATTTTCAGTAGCGGGAATGCACG-------------------------------------------------------------------------------------CCTAGAGCCCTGGGCCAGAGCAAGTGAGGGCCCCCCCACCCCCTCCAGTCCTGCCTCCAGCCCTGTTCATG\\nContig0:146117-146349;size=1\\nAATTAAACACGGGGTAAGCCACTGCCATGCTCTGACCCAAAGGTGGCTGGGGTATTTTCAGTAGCGGGAATGCACG-------------------------------------------------------------------------------------CCTAGAGCCCTGGGCCAGAGCAAGTGAGGGCCCCCCCACCCCCTCCAGTCCTGCCTCCAGCCCTGTTCATG\\n//\\n//\\nContig0:148378-148953;size=2\\nCATGCTGCAGCTAGGGGGAAATGGGGGCAGGGCCGGGGGAACCTCACCCTCCCCAGCTGCGAATCCTGGGA---------CGATCCGACCCGCGGACTGGAGTTCTTTACCCACCCCCTGGCCCTTTAACAACCGGTTCTCCACGGAGGTCTAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:148378-148953;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------CATGTCCTGGTTCACTTCTTCGTCTCTTATAACTTTTGTATTGAGTGCAATACATAGGGCTTGCAAAAATG---------------------------------------------------------------------TCTGGCTGTCCTGTGGAGATTTACAAACCAACCTGCTTCATTGACAGTGCAGTGGGGTTTTCTTCTGTTTCAAATT\\nContig0:148378-148953;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTCCTGGTTCACTTCTTCGTCTCTTATAACTTTTGTATTGAGTGCAATACATAGGGCTTGCAAAAATG------------------------------------------------------------------------------------------------------------ATTGACAGTGCAGTGGGGTTTTCTTCTGTTTCAAATT\\n//\\n//\\nContig0:149397-149596;size=2\\nAATTTGAGTTCAGCTGTATTTGCAAAGAGGTGAAACACTCCAAACCAAAATGAAACATTTCGTTTGACAGAACTGT----------------------------------------------------CTCAGCCAACCAACTGAAAAATCTGTTATTTTAACAGCTCTACCCAGGAAGCAAGAGCCAGAGCAGGCATG\\n//\\n//\\nContig0:150178-150481;size=1\\nAATTCTTCGGAAGAGCCTTTTTTTCTAACCTGTGATGCAAGCTGGTCCATTTCAAACACCAGGTATCTCCAAAACAATCCCCACAGGCAGCTACCCCCCTCCAAGATCTCCAAGCACCACTTCAAATAGCGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:150178-150481;size=1\\nAATTCTTCGGAAGAGCCTTTTTTTCTAACCTGTGATGCAAGCTGG-----------------------CCAAAACAATCCCCACAGGCAGCTACCCCCCTCCAAGATCTCCAAGCACCACTTCAAATAGCGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:150178-150481;size=1\\n------------------------------------------------------------------------------------------------------------------------
------------CATGTGGAATCCTGTGCTATCAATCTGCTCCTCAAAAGAGCCCCCATGAACAAAGCAGAGATGTTCCTAGC------------------------TAGCAAATCCCAGCCTGGCCCTGGGTTCAGACAAGCTCAAACCAGCACAATACAAAGAGAAGAGCAAAGTCTAATT\\n//\\n//\\nContig0:150767-151006;size=17\\nAATTTGATACCATATTCTTCTCTATTTTCTGTCCTCTGTGCACCAAAATCCCACCAAAGCAGGATGTTTGGGGGGT--------------------------------------------------------------------------------------------GTAGATGTGTTTTGGTTGCAGGGAGGCATTCTCTGGGGACATACACATAGGAATGCCTGTATGTCTACATG\\nContig0:150767-151006;size=1\\nAATTTGATACCATATTCTTCTCTATTTTCTGTCCTCTGTGCACCAAAATCCCACCAAAGCAGGATGTTTGGGGGG---------------------------------------------------------------------------------------------GTAGATGTGTTTTGGTTGCAGGGAGGCATTCTCTGGGGACATACACATAGGAATGCCTGTATGTCTACATG\\n//\\n//\\nContig0:151911-152069;size=1\\nAAATTGCATCGAAGAGCACAGAGGCAGGG----------------------------------------------------------TAAACCTACGCTAATGAACACAAATATAGAAGAGAAGGATCACCCAGCAGTGTTAAGGAAGCCGCCGCATG\\n//\\n//\\nContig0:152674-152906;size=7\\nAATTTCCCAGCTTCTGGGGAGTCCTGGAGAGGAGGAAGAATCCGGATCCCTTTGCAGACGGCATTGCTGCTGCACT----------------------------------------------------------------------------------------CTCTGAAGAAGGGCCAGGGGTCAGGGGTGAAAGTATGGGCTTCGTTGTGAGGAAGTGTCCCTGCCATG\\nContig0:152674-152906;size=1\\nAATTTCCCAGCTTCTGGGGAGTCCTGGAGAGGAGGAAGAATCCGGATCCCTTTGCAGACGGCATTGCTGCTGCACT----------------------------------------------------------------------------------------CTCTGAAGAAGGGCAAGGGGTCAGGGGTGAAAGTATGGGCTTCGTTGTGAGGAAGTGTCCCTGCCATG\\n//\\n//\\nContig0:153152-153312;size=1\\nCATGTACGTCGCACGACTGAATCAATGGGGCTCCTCACCCACTTAAAGGGAATCGTGGGGTTCAGTGCTTT-------------AGGTTTGTTTTGCCACCTCCTGGACTGGCACCTACGAGTCTAAGGCTAAATATTATTTCTAGTAGGGCTGCCAATT\\n//\\n//\\nContig0:153761-154003;size=17\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;
size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATGAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGG-------------------------------------------------------------------------------------------------------------------------------------------------TTTGATTGGCTGGAGGGGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA------------------------------------------------------------------------------------------------TTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATCCTCACCACAGGACCTTCCTCCTGGTGTCTGAT-------------------------------------------------------------------------------------------------------------------------------------------TGATTAGCTGGAGGTGATCTAATCAGCCGGTGTGCTTTAATT\\nContig0:153761-154003;size=1\\nCATAGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA--------------------------------------------------------------------------------------------------TTTAAACCCAGGTGCCCTGATTACCCTGCTTTGATTGGCTGCAGGTGTTCTAATCAGCCAGTCTGCCTTAATT\\n//\\n//\\nContig0:155102-155187;size=1\\nAATTCAGTTCCCCATCACCTGGGGCCAGGACTCTTCTGGGCAGCTTCAGCAGGGACGGGAGGCCAGAGACTGTGTGTGGTCCATG\\n//\\n//\\nContig0:157369-157580;size=1\\nAATTCCTGGCTCTGCTACCGTCTAGCTGTGTGATGGTGCA-----------------------------------------------------------------------------------------------TCCTCTGATGAGAGATGCTATTTCTTAGCTCCTACATATTAGCCAAAGGCTATTTAACCCA
CTTTCCAGTGTAATT\\n//\\n//\\nContig0:158525-158758;size=7\\nAATTAGATCTTGTGCACAATGAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACAAGCCTCCACCACCA--------------------------------------------------------------------------------------ACCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\\nContig0:158525-158758;size=3\\nAATTAGATCTTGTGCACAATGAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACAAGCCTCCACCACCA---------------------------------------------------------------------------------------CCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\\nContig0:158525-158758;size=1\\nAATTAGATCTTGTGCACAATTAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACA----------------------------------------------------------------------------------------------------ACCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\\n//\\n//\\nContig0:160325-160508;size=1\\nAATTTGCAAGGCAGGGAGCT--CAGTGTCTGCTCCAAAAATCCGCGCTCTCTGTCTCCCCGATGCTCCCTGTCACACT-----------------------------------------------------TAGCTGCCCACAATGCACCACTCCCAACAGCGCTGCAAATGTGGCCACACTT\\n//\\n//\\nContig0:161569-161784;size=1\\nAATTGAAATAGCAAGGAGGGTGCTCAGTGACGGTGGGCATGATATAAAAACCCAAACAGATCGGAAGTCAGTGGCA--------------------------------------------------------------------CCTGCATCAAAAAGAAGAGACTTCCTGAAATAGACAAGTGTCTCAGCAATGAGACAGGCTCAGAGAACATG\\n//\\n//\\nContig0:163234-163571;size=1\\nAATTCTGGCTCATTGAACTTCATTTACTCATTGCTGCAATCCCAAGCCTTCAAAAATCATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:163234-163571;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGCTGCAGTTTAACTTTTTTGTTTTGAAAGTAAAGCAGTGATTCTCC
TGCAATCTTGTGGGGCCTTAAGACAGACATCAAACATCCTGAGATTCGCAATAAGAACTCAATT\\n//\\n//\\nContig0:165831-166037;size=1\\nAATTTCCTGTCCTCCTTCTACCCAAAGCTTCCCCTCCCCACAACGTGCCTCAGGCCCAAAGAATGACCTCGGAAGC-----------------------------------------------------------CCGTGGACTGCCCCAGTGGAGGCATGACAAGATGGACCGTGCGAGTTCCCCCTTTTCTGTGCTGCCCCATG\\nContig0:165831-166037;size=1\\nAATTTCCTGTCCTCCTTCTACCCAAAGCTTCCCCTCCCCACAACGTGCCTCAGGCCCAAAGAATGACCTCGGAAGCATG-------------------------------------------------------------------------------------------------------------------------------\\n//\\n//\\nContig0:169197-169391;size=1\\nAATTAGCAAGGAGGTTTGTCCTCTTGCCCCCCAGGGGCATCTTTGTTGGGGCTGGTTCAGACTGATTGCATAA------------------------------------------------------CCCAGTGACCTCCAAGCACTTTACAAACCTGCCTCAGGGGCCTTACACAGGAGGAAAAGGAGACATG\\nContig0:169197-169391;size=1\\nAATTAGCAAGGAGGTTTGTCCTCTTGCCCCCCAGGGGCATCTTTGTTGGGGCTGGTTCAGACTGATTGCATAAGCT-----------------------------------------------TTCACCCAGTGACCTCCAAGCACTTTACAAACCTGCCTCAGGGGCCTTACACAGGAGGAAAAGGAGACATG'" ] }, "execution_count": 547, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\\n//\\n//\\n\".join(c).encode()\n" ] }, { "cell_type": "code", "execution_count": 397, "metadata": {}, "outputs": [], "source": [ "regions = (i.split(\"\\t\") for i in fullregions)\n", "regions = ((i, int(j), int(k)) for (i, j, k) in regions)\n", "build_ref_cigars(data, sample)" ] }, { "cell_type": "code", "execution_count": 438, "metadata": {}, "outputs": [], "source": [ "reg = ('Contig0', 41920, 42479)\n", "reg = ('Contig0', 7219468, 7220326)\n", "reg = ('Contig0', 207998, 208219)\n", "reg = ('Contig0', 16738, 16933)\n", "reg = ('Contig0', 49781, 50005)\n", "reg = ('Contig0', 76480, 76711)\n", "reg = ('Contig0', 24391, 24908)\n", "reg = ('Contig0', 7193, 7952) # has merge\n", "#reg = ('Contig0', 13327, 13845) # has indels\n", "reg = ('Contig0', 76480, 77015) # has indels\n", "\n", "reg = ('Contig0', 346131, 346193) # valueerror\n", "\n", "samfile = 
def cigared(sequence, cigartups):
    """
    Lay a read out in reference coordinates according to its CIGAR.

    Parameters
    ----------
    sequence : str
        The query sequence of the read, as stored in the alignment record.
    cigartups : iterable of (int, int)
        (operation, length) pairs using the numeric BAM CIGAR op codes,
        e.g. ``[(4, 15), (0, 25), (1, 3)]``.

    Returns
    -------
    str
        The read bases that occupy reference positions, with "-" inserted
        for every reference base the read skips. Bases that do not occupy
        a reference position (insertions, soft clips) are left out.
    """
    # Op codes grouped by what they consume (SAM specification):
    #   M(0), =(7), X(8) consume query and reference -> copy read bases
    #   D(2), N(3)       consume reference only      -> emit gap characters
    #   I(1), S(4)       consume query only          -> skip read bases
    #   H(5), P(6)       consume neither             -> nothing to do
    aligned_ops = (0, 7, 8)
    reference_only_ops = (2, 3)
    query_only_ops = (1, 4)

    offset = 0          # current position in `sequence`
    pieces = []
    for op, length in cigartups:
        # Compare by value: `is` on ints only works by accident of CPython's
        # small-int cache and fails for e.g. numpy integers.
        if op in aligned_ops:
            pieces.append(sequence[offset:offset + length])
            offset += length
        elif op in reference_only_ops:
            pieces.append("-" * length)
        elif op in query_only_ops:
            offset += length
        # Hard clips / padding are not present in `sequence`, so the offset
        # must not move for them.
    return "".join(pieces)
def join_arrays(arr1, arr2):
    """
    Merge two equally positioned character arrays into one consensus array.

    Per position the result is:
      * the shared character when both arrays agree;
      * the informative character when exactly one side is "-" or "N";
      * "N" otherwise (two different bases, or "-" paired with "N").

    Parameters
    ----------
    arr1, arr2 : numpy.ndarray of single-character strings
        1-D arrays; only the first ``arr1.size`` entries of ``arr2`` are
        used.

    Returns
    -------
    numpy.ndarray
        Array of dtype "U1" with ``arr1.size`` elements.

    Raises
    ------
    IndexError
        If ``arr2`` has fewer elements than ``arr1``.
    """
    arr1 = np.asarray(arr1)
    arr2 = np.asarray(arr2)
    if arr2.size < arr1.size:
        raise IndexError("arr2 has fewer elements than arr1")
    arr2 = arr2[:arr1.size]

    # A position carries no base information if it is a gap or an N.
    uninformative1 = (arr1 == "-") | (arr1 == "N")
    uninformative2 = (arr2 == "-") | (arr2 == "N")

    merged = np.where(
        arr1 == arr2,
        arr1,
        np.where(
            uninformative1 & ~uninformative2,
            arr2,
            np.where(uninformative2 & ~uninformative1, arr1, "N"),
        ),
    )
    return merged.astype("U1")
def get_ref_region(reference, contig, rstart, rend):
    """
    Extract one interval of the reference with ``samtools faidx``.

    Parameters
    ----------
    reference : str
        Path to the reference fasta file.
    contig : str
        Name of the contig/scaffold to read from.
    rstart, rend : int
        Interval bounds. ``rstart`` is shifted by +1 before being passed to
        faidx (presumably callers hold 0-based starts such as BED
        intervals -- confirm against the caller).

    Returns
    -------
    list
        ``[header, sequence]`` where ``header`` is the first line of the
        faidx output and ``sequence`` is the remainder with line breaks
        removed.

    Raises
    ------
    RuntimeError
        If samtools exits with a non-zero status.
    ValueError
        If samtools produced no header line at all.
    """
    region = "{}:{}-{}".format(contig, rstart + 1, rend)
    cmd = [ip.bins.samtools, 'faidx', reference, region]

    # Capture stderr too so a failure can be reported instead of surfacing
    # later as an unrelated unpacking error.
    proc = sps.Popen(cmd, stdout=sps.PIPE, stderr=sps.PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode:
        raise RuntimeError(
            "samtools faidx failed for {}: {}".format(
                region, stderr.decode().strip()))

    name, newline, seq = stdout.decode().partition("\n")
    if not newline:
        raise ValueError(
            "samtools faidx returned no sequence for {}".format(region))
    return [name, seq.replace("\n", "")]
"execution_count": 274, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTG\n", "---------------------------------------- AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTGTATTTATTTAA\n" ] } ], "source": [ "print(ref[1][:90])\n", "print(\"-\"*ahead, r1.seq)" ] }, { "cell_type": "code", "execution_count": 344, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "r AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTG\n", "h AATTTATTATGAAACAAAAGGCAAAAAACTGTTATGTACATAGTTTAGTCCTATTGAGTGTCTACTCAGCGCT\n", "h -----------------------------------------AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTG\n", "r AATCGCTCGCCGTGCTCGTCGCTCACTGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG\n", "24679 24608 71\n", "h -----------------------------------------------------------------------\n", "71M\n", "24679 24616 63\n", "h ---------------------------------------------------------------\n", "63M\n" ] } ], "source": [ "\n", "print('r', ref[1][:90])\n", "\n", "for key in rdict:\n", " r1, r2 = rdict[key]\n", " \n", " ahead = -1 * (reg[1] - (r1.pos))\n", " print('h', (\"-\"*ahead + r1.seq)[:90])\n", " #print(r1.cigar)\n", " \n", "print('r', ref[1][-90:])\n", "\n", "for key in rdict:\n", " r1, r2 = rdict[key]\n", " \n", " ahead = (reg[2] - (r2.pos))\n", " \n", " print(reg[2], r2.pos, ahead)\n", " print('h', (\"-\"*ahead))\n", " #print(r1.cigar)\n", "\n", " #print(r2.seq)\n", " #print(r2.get_blocks())\n", " #print(r1.seq)\n", " #print(r1.get_blocks()[0])\n", " #print(r1.aend, r2.get_blocks()[0][0])\n", " #print(r1.cigar)\n", " print(r2.cigarstring)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAAAGGTnnnnAGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG\n", "AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGTnnnnGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG\n", "AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAAAGGnnnnAGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG\n" ] } ], "source": [ "for read in clust:\n", " print(read)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "not enough values to unpack (expected 2, got 1)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mqname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0mclust\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_to_clust\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36mdict_to_clust\u001b[0;34m(rdict)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mclust\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mpair\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mrdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mr1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpair\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mposs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_reference_positions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mr2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_reference_positions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mseedstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseedend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mposs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mposs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)" ] } ], "source": [ "# access reads from bam file using pysam\n", "samfile = AlignmentFile(\n", " os.path.join(data.dirs.refmapping, \n", " \"{}-mapped-sorted.bam\".format(sample.name)),\n", " 'rb')\n", "\n", "# iterate over all mapped regions\n", "clusters = []\n", "for reg in regions:\n", " ireg = samfile.fetch(*reg)\n", "\n", " # match paired reads by read names for all reads in each cluster\n", " rdict = {}\n", " for read in ireg:\n", " if read.qname not in rdict:\n", " rdict[read.qname] = [read]\n", " else:\n", " rdict[read.qname].append(read)\n", "\n", " clust = dict_to_clust(rdict)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'1c64525c1f5f58dc1f713b0e5e1d0941;size=1': []}" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } 
def build_ref_clusters_from_cigars(data, sample):
    """
    Build a read cluster for a mapped region of a sample's sorted BAM file.

    Regions come from ``bedtools_merge(data, sample)`` (tab-separated
    contig, start, end per line); reads are pulled from
    ``<refmapping dir>/<sample name>-mapped-sorted.bam`` and grouped by
    read name before being passed to ``dict_to_clust``.

    Returns
    -------
    list
        The cluster produced by ``dict_to_clust`` for the first region, or
        an empty list when bedtools reported no regions.

    NOTE(review): only the first region is processed before returning,
    which matches the output stored with the notebook. The function name
    suggests every region was meant to be collected -- confirm before
    relying on this for whole-sample processing.
    """
    # get regions from bedtools overlaps; blank lines (e.g. empty output)
    # are ignored rather than failing on tuple unpacking.
    merged = bedtools_merge(data, sample).strip().split("\n")
    fields = (line.split("\t") for line in merged if line.strip())
    regions = ((contig, int(start), int(end)) for (contig, start, end) in fields)

    bam_path = os.path.join(
        data.dirs.refmapping,
        "{}-mapped-sorted.bam".format(sample.name))

    # access reads from bam file using pysam; the context manager makes
    # sure the handle is released on every exit path.
    with AlignmentFile(bam_path, 'rb') as samfile:
        for reg in regions:
            # match paired reads by read names for all reads in the region
            rdict = {}
            for read in samfile.fetch(*reg):
                rdict.setdefault(read.qname, []).append(read)
            return dict_to_clust(rdict)

    return []
def dict_to_clust(rdict):
    """
    Turn name-matched read pairs into 'nnnn'-joined cluster sequences.

    Parameters
    ----------
    rdict : dict
        Maps a read name to the list of alignment records sharing that
        name. Each record must provide ``seq``, ``is_reverse``, ``aend``
        and ``get_blocks()``.

    Returns
    -------
    list of str
        One ``<first>nnnn<second>`` string per complete pair. The mate
        that is not flagged reverse (judged from the first record) is
        written first. When the first mate's alignment end lies beyond the
        start of the second mate's first block, the second mate is
        reverse-complemented; otherwise it is appended as stored.

    Entries that do not hold exactly two records (e.g. a region that
    contains only one mate) are skipped, since they cannot be unpacked
    into a pair.
    """
    clust = []
    for pair in rdict.values():
        if len(pair) != 2:
            continue
        r1, r2 = pair

        # Put the non-reverse mate first.
        if r1.is_reverse:
            first, second = r2, r1
        else:
            first, second = r1, r2

        reads_overlap = first.aend > second.get_blocks()[0][0]
        if reads_overlap:
            tail = revcomp(second.seq)
        else:
            # Previously the reverse-strand case appended the first mate a
            # second time here and silently dropped the other mate.
            tail = second.seq

        clust.append(first.seq + 'nnnn' + tail)
    return clust
"TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCCC\n", "5009\n", "TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGAACCAGCTGCCCCC\n", "5009\n", "TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGCCCCGACCCAGCTGCCCGC\n", "5009\n", "TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGC\n", "5009\n", "TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG\n", "5009\n", "TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATAGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG\n", "5009\n", "TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATAGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG\n", "5009\n", "TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG\n", "5009\n", "TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCTGCTGCAGTTAACTGGCCTCTTAACCCCG\n" ] } ], "source": [ "chrom, rstart, rend = regions[0].split()\n", "reg = samfile.fetch(chrom, int(rstart), int(rend))\n", "\n", "for read in reg:\n", " print(rstart)\n", " print(read.seq)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "ename": "IndexError", "evalue": "list index out of range", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# file precedence\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mnonm1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m 
def split_endtoend_reads(data, sample):
    """
    Split dereplicated 'R1nnnnR2' records back into separate R1 and R2
    files for read mapping.

    Reads `<data.tmpdir>/<sample.name>_derep.fastq` two lines at a time as
    (seqname, sequence) and writes
    `<data.tmpdir>/<sample.name>_R1-tmp.fastq` and
    `<data.tmpdir>/<sample.name>_R2-tmp.fastq`, each holding the same
    seqname line followed by that read's half of the sequence.

    Parameters
    ----------
    data : object with a `tmpdir` attribute (directory path).
    sample : object with a `name` attribute.

    Raises
    ------
    ValueError
        If a sequence line does not split into exactly two parts on 'nnnn'.
    """
    inp = os.path.join(data.tmpdir, "{}_derep.fastq".format(sample.name))
    out1 = os.path.join(data.tmpdir, "{}_R1-tmp.fastq".format(sample.name))
    out2 = os.path.join(data.tmpdir, "{}_R2-tmp.fastq".format(sample.name))

    # All three handles live in one `with`, so they are closed even when a
    # malformed record raises part-way through.
    with open(inp, 'r') as infile, \
            open(out1, 'w') as splitderep1, \
            open(out2, 'w') as splitderep2:

        # Pair consecutive lines as (seqname, sequence). The builtin zip is
        # lazy on Python 3, so no py2-only `izip` is needed; a trailing
        # unpaired line is dropped, as before.
        duo = zip(*[iter(infile)] * 2)

        ## lists for storing results until ready to write
        split1s = []
        split2s = []

        for idx, (seqname, sequence) in enumerate(duo, 1):
            part1, part2 = sequence.split("nnnn")

            ## R1 needs a newline, but R2 inherits it from the original file
            split1s.append("{}{}\n".format(seqname, part1))
            split2s.append("{}{}".format(seqname, part2))

            ## flush every 10000 records to bound memory use
            if not idx % 10000:
                splitderep1.write("".join(split1s))
                splitderep2.write("".join(split2s))
                split1s = []
                split2s = []

        ## write final chunk if there is any
        if split1s:
            splitderep1.write("".join(split1s))
            splitderep2.write("".join(split2s))
def bedtools_merge(data, sample):
    """
    Get all contiguous genomic regions with one or more overlapping reads.

    Runs the equivalent of the shell pipeline

        bedtools bamtobed -i <sample>-mapped-sorted.bam | bedtools merge -d N -i -

        -i : specifies the input file to bed'ize ('-' reads stdin)
        -d : distance passed to `bedtools merge` (see below)

    Parameters
    ----------
    data : Assembly
        Reads `data.dirs.refmapping`, `data.paramsdict["datatype"]` and
        `data._hackersonly`. For paired datatypes
        `_hackersonly["max_inner_mate_distance"]` is refreshed first by
        `check_insert_size`.
    sample : Sample
        Reads `sample.name`.

    Returns
    -------
    str
        The raw (decoded) output of `bedtools merge`.

    Raises
    ------
    IPyradWarningExit
        If `bedtools merge` exits with a non-zero status.
    """
    LOGGER.info("Entering bedtools_merge: %s", sample.name)
    mappedreads = os.path.join(
        data.dirs.refmapping, sample.name + "-mapped-sorted.bam")

    ## If PE the -d flag tells bedtools how far apart to allow mate pairs.
    ## If SE the -d flag is negative, specifying that SE reads need to
    ## overlap by at least a specific number of bp. This prevents the
    ## stairstep syndrome when a + and - read are both extending from
    ## the same cutsite.
    if 'pair' in data.paramsdict["datatype"]:
        check_insert_size(data, sample)
        merge_dist = data._hackersonly["max_inner_mate_distance"]
    else:
        merge_dist = -1 * data._hackersonly["min_SE_refmap_overlap"]

    ## Usage: bedtools bamtobed [OPTIONS] -i <bam>
    ## Usage: bedtools merge [OPTIONS] -i <bed>
    cmd1 = [ip.bins.bedtools, "bamtobed", "-i", mappedreads]
    cmd2 = [ip.bins.bedtools, "merge", "-d", str(merge_dist), "-i", "-"]

    ## pipe output from bamtobed into merge
    LOGGER.info("stdv: bedtools merge cmds: %s %s", cmd1, cmd2)
    proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
    proc2 = sps.Popen(
        cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)
    ## drop our copy of the pipe so proc1 gets SIGPIPE if proc2 exits early
    proc1.stdout.close()
    result = proc2.communicate()[0].decode()
    proc1.wait()

    ## check for errors (message is formatted here; exceptions do not
    ## interpolate logging-style arguments)
    if proc2.returncode:
        raise IPyradWarningExit("error in {}: {}".format(cmd2, result))

    ## Write the bedfile out, because it's useful sometimes.
    if os.path.exists(ip.__debugflag__):
        bedpath = os.path.join(data.dirs.refmapping, sample.name + ".bed")
        with open(bedpath, 'w') as outfile:
            outfile.write(result)

    ## Report the number of regions we're returning (0 for empty output)
    nregions = len([line for line in result.splitlines() if line.strip()])
    LOGGER.info("bedtools_merge: Got # regions: %s", nregions)
    return result


def check_insert_size(data, sample):
    """
    Check mean insert size for this sample and update
    hackersonly.max_inner_mate_distance. This value controls how far apart
    mate pairs can be to still be considered for bedtools merging
    downstream.

    Runs `samtools stats` on `<sample.name>-mapped-sorted.bam` in
    `data.dirs.refmapping` and parses the "insert size average",
    "insert size standard deviation" and "average length" summary lines.

    Side effects
    ------------
    Sets `data._hackersonly["max_inner_mate_distance"]` to the computed
    distance, or to 300 when any of the three statistics is missing/zero.

    Raises
    ------
    IPyradWarningExit
        If `samtools stats` exits with a non-zero status.
    """
    # function-scope import: `np.math` is only an alias of the stdlib math
    # module and is absent from recent NumPy releases.
    import math

    cmd = [
        ip.bins.samtools,
        "stats",
        os.path.join(
            data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)),
    ]
    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
    res = proc.communicate()[0].decode()
    if proc.returncode:
        raise IPyradWarningExit("error in {}: {}".format(cmd, res))

    ## starting vals
    avg_insert = 0
    stdv_insert = 0
    avg_len = 0

    ## iterate over results, keeping only lines that contain "SN"
    ## (this replaces the former `grep SN` stage of the pipe)
    for line in res.split("\n"):
        if "SN" not in line:
            continue

        if "insert size average" in line:
            avg_insert = float(line.split(":")[-1].strip())

        elif "insert size standard deviation" in line:
            ## hack to fix sim data when stdv is 0.0. Shouldn't
            ## impact real data bcz stdv gets rounded up below
            stdv_insert = float(line.split(":")[-1].strip()) + 0.1

        elif "average length" in line:
            avg_len = float(line.split(":")[-1].strip())

    LOGGER.debug("avg {} stdv {} avg_len {}"
                 .format(avg_insert, stdv_insert, avg_len))

    ## If all values return successfully set the max inner mate distance.
    ## This is tricky. avg_insert is the average length of R1+R2+inner mate
    ## distance. avg_len is the average length of a read. If there are lots
    ## of reads that overlap then avg_insert will be close to but bigger than
    ## avg_len. We are looking for the right value for `bedtools merge -d`
    ## which wants to know the max distance between reads.
    if all([avg_insert, stdv_insert, avg_len]):
        ## enforce a floor on the spread
        if stdv_insert < 5:
            stdv_insert = 5.
        spread = 3 * math.ceil(stdv_insert)

        ## If 2 * the average length of a read is less than the average
        ## insert size then most reads DO NOT overlap
        if (2 * avg_len) < avg_insert:
            hack = avg_insert + spread - (2 * avg_len)

        ## Otherwise most reads DO overlap, so we have to calculate inner
        ## mate distance a little differently.
        else:
            hack = (avg_insert - avg_len) + spread

        ## set the hackerdict value
        LOGGER.info("stdv: hacked insert size is %s", hack)
        data._hackersonly["max_inner_mate_distance"] = int(math.ceil(hack))

    else:
        ## If something went wrong then set a relatively conservative distance
        data._hackersonly["max_inner_mate_distance"] = 300

    LOGGER.debug("inner mate distance for {} - {}".format(
        sample.name, data._hackersonly["max_inner_mate_distance"]))
"'MT\\t10109\\t10200'" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regions[1]" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "ename": "FileNotFoundError", "evalue": "[Errno 2] could not open alignment file `ex1.sam`: No such file or directory", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msamfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpysam\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mAlignmentFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"ex1.sam\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32mpysam/libcalignmentfile.pyx\u001b[0m in \u001b[0;36mpysam.libcalignmentfile.AlignmentFile.__cinit__\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpysam/libcalignmentfile.pyx\u001b[0m in \u001b[0;36mpysam.libcalignmentfile.AlignmentFile._open\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] could not open alignment file `ex1.sam`: No such file or directory" ] } ], "source": [ "samfile = pysam.AlignmentFile(\"ex1.sam\", \"r\")" ] }, { "cell_type": "code", "execution_count": 262, "metadata": {}, "outputs": [], "source": [ "a = pysam.AlignmentFile(\"./3-refsetest_refmapping/1A_0.sam\")" ] }, { "cell_type": "code", "execution_count": 287, "metadata": {}, "outputs": [], "source": [ "nthreads = 2\n", "cmd1 = [\n", " ip.bins.bwa, \"mem\",\n", " \"-t\", str(max(1, nthreads)),\n", " \"-M\",\n", " data.paramsdict['reference_sequence'], \n", " sample.concatfiles[0][0],\n", " sample.concatfiles[0][1] or \"\",\n", " ] \n", "cmd1 = [i for i in cmd1 if i]\n", "\n", "# Insert optional flags for bwa\n", "bwa_args = 
def bwa_map(data, sample, nthreads, force):
    """
    Map a sample's reads to the reference with `bwa mem`, then split the
    alignments into a sorted, indexed BAM of mapped reads and fastq
    file(s) of unmapped reads.

    Parameters
    ----------
    data : Assembly
        Reads `data.dirs.refmapping`, `data.dirs.edits`,
        `data.paramsdict['reference_sequence']`,
        `data.paramsdict['datatype']` and `data._hackersonly['bwa_args']`.
    sample : Sample
        Reads `sample.name` and `sample.concatfiles[0]` (an (R1, R2) pair;
        an empty R2 is skipped). Sets `sample.files.sam`,
        `sample.files.mapped_reads`, `sample.files.unmapped_bam` and
        `sample.files.unmapped_reads`.
    nthreads : int
        Thread count for `bwa mem -t` (values below 1 are raised to 1).
    force : any
        Accepted for call compatibility; not used inside this function.

    Raises
    ------
    IPyradError
        If `bwa mem` exits with a non-zero status.
    IPyradWarningExit
        If `samtools sort`, `samtools index` or `samtools bam2fq` exits
        with a non-zero status.
    """
    sample.files.sam = os.path.join(
        data.dirs.refmapping,
        "{}.sam".format(sample.name))

    sample.files.mapped_reads = os.path.join(
        data.dirs.refmapping,
        "{}-mapped-sorted.bam".format(sample.name))

    sample.files.unmapped_bam = os.path.join(
        data.dirs.refmapping,
        "{}-unmapped.bam".format(sample.name))

    sample.files.unmapped_reads = os.path.join(
        data.dirs.refmapping,
        "{}-unmapped.fastq".format(sample.name))

    # (cmd1) bwa mem [OPTIONS] <reference> <R1> [<R2>]
    #   -t # : Number of threads
    #   -M   : Mark split alignments as secondary.
    cmd1 = [
        ip.bins.bwa, "mem",
        "-t", str(max(1, nthreads)),
        "-M",
        data.paramsdict['reference_sequence'],
        sample.concatfiles[0][0],
        sample.concatfiles[0][1] or "",
    ]
    # drop the empty R2 placeholder for single-end data
    cmd1 = [i for i in cmd1 if i]

    # Splice optional user flags in right after "mem", keeping their order
    cmd1[2:2] = data._hackersonly["bwa_args"].split()

    # bwa writes SAM to stdout (the file) and diagnostics to stderr, which
    # is captured so a failure can be reported with its actual message.
    with open(sample.files.sam, 'wb') as outfile:
        proc1 = sps.Popen(cmd1, stderr=sps.PIPE, stdout=outfile)
        error1 = proc1.communicate()[1]
        if proc1.returncode:
            raise IPyradError(error1.decode())

    # (cmd2) samtools view [options] <in.sam>
    #   -b = write to .bam
    #   -F = Select all reads that DON'T have these flags (0x904):
    #         0x4   (segment unmapped)
    #         0x100 (Secondary alignment)
    #         0x800 (supplementary alignment)
    #   -U = Write out all reads that don't pass the filter
    #        (all unmapped reads go to this file).
    #   NOTE: no mapq (-q) filter is applied here.
    # Sends unmapped reads to a file and PIPEs mapped reads to cmd3.
    cmd2 = [
        ip.bins.samtools, "view",
        "-b",
        "-F", "0x904",
        "-U", sample.files.unmapped_bam,
        sample.files.sam,
    ]

    # (cmd3) samtools sort [options...] [in.bam]
    #   -T = Temporary file prefix, required by samtools
    #   -O = Output file format, in this case bam
    #   -o = Output file name
    # Catches mapped bam output from cmd2 and writes it to file.
    cmd3 = [
        ip.bins.samtools, "sort",
        "-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"),
        "-O", "bam",
        "-o", sample.files.mapped_reads]

    # (cmd4) index the sorted bam so regions can be fetched from it later.
    # Uses the same `ip` alias as every other command in this function.
    cmd4 = [ip.bins.samtools, "index", sample.files.mapped_reads]

    # (cmd5) samtools bam2fq -v 45 [in.bam]
    #   -v 45 = set the default qscore arbitrarily high
    # The flag and its value are separate argv entries.
    cmd5 = [
        ip.bins.samtools, "bam2fq",
        "-v", "45",
        sample.files.unmapped_bam,
    ]

    # Insert additional arguments for paired data to the commands.
    # We assume Illumina paired end reads for the orientation
    # of mate pairs (orientation: ---> <----).
    if 'pair' in data.paramsdict["datatype"]:
        # add samtools filter to only keep if both pairs hit
        #   0x1 - Read is paired
        #   0x2 - Each read properly aligned
        cmd2[2:2] = ["-f", "0x3"]

        # tell bam2fq that there are output files for each read pair
        umap1 = os.path.join(
            data.dirs.edits, sample.name + "-tmp-umap1.fastq")
        umap2 = os.path.join(
            data.dirs.edits, sample.name + "-tmp-umap2.fastq")
        cmd5[2:2] = ["-1", umap1, "-2", umap2]
    else:
        cmd5[2:2] = ["-0", sample.files.unmapped_reads]

    # cmd2 writes to sname-unmapped.bam and fills pipe with mapped BAM data
    LOGGER.debug(" ".join(cmd2))
    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)

    # cmd3 pulls mapped BAM from pipe and writes to sname-mapped-sorted.bam
    LOGGER.debug(" ".join(cmd3))
    proc3 = sps.Popen(
        cmd3, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)
    # drop our copy of the pipe so proc2 gets SIGPIPE if proc3 exits early
    proc2.stdout.close()
    error3 = proc3.communicate()[0]
    proc2.wait()
    if proc3.returncode:
        raise IPyradWarningExit(error3.decode())

    # cmd4 indexes the bam file
    LOGGER.debug(" ".join(cmd4))
    proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)
    error4 = proc4.communicate()[0]
    if proc4.returncode:
        raise IPyradWarningExit(error4.decode())

    # cmd5 writes refmapping/sname-unmapped.fastq for SE data, or
    # edits/sname-tmp-umap{1,2}.fastq for paired data, which will then
    # need to be merged.
    LOGGER.debug(" ".join(cmd5))
    proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)
    error5 = proc5.communicate()[0]
    if proc5.returncode:
        raise IPyradWarningExit(error5.decode())
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n" ] }, { "cell_type": "code", "execution_count": 205, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(None, None)" ] }, "execution_count": 205, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cmd1 = [\n", " ip.bins.bwa, \"mem\",\n", " \"-t\", str(max(1, 2)),\n", " \"-M\",\n", " data.paramsdict['reference_sequence'], \n", " sample.concatfiles[0][0],\n", " sample.concatfiles[0][1] or \"\",\n", " ] \n", "\n", "proc1 = sps.Popen(cmd1)#, stderr=cmd1_stderr, stdout=cmd1_stdout)\n", "proc1.communicate()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pysam.sort(\"-m\", \"1000000\", \"-o\", \"output.bam\", \"ex1.bam\")" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [], "source": [ "iterreg = samfile.fetch(\"MT\", 5009, 5100)\n", "iterreg = samfile.fetch(\"MT\", 10109, 10200)\n", "iterreg = samfile.fetch(\"MT\", 285510, 285600)" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TGCAGACCATAATGGCATGGTGCCACAGCTAGGGACCGATCTCTTTCTAAACTACATCCCCTAGGTTGAGACCCATGCGTGCTCTAATTGG\n" ] }, { "ename": "TypeError", "evalue": "Argument 'other' has incorrect type (expected pysam.libcalignedsegment.AlignedSegment, got str)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mread\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterreg\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m 
# Show each fetched read next to the reference sequence it aligned to.
# `AlignedSegment.compare()` only accepts another AlignedSegment, so
# handing it the reference string raises TypeError; print the reference
# string directly instead (get_reference_sequence() needs the MD tag).
for read in iterreg:
    print(read.seq)
    print(read.get_reference_sequence())

# Column-wise iterator over the same MT window. The dangling
# `pysam.Pileup.` attribute lookup that followed was a syntax error.
it = samfile.pileup("MT", 5009, 5100)
] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), 
(53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 
10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 
10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 
10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 
10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 
10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 
10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 
10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n", "[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 
10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n" ] } ], "source": [ "for read in iterreg:\n", " print(read.aligned_pairs)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "rdict = {}\n", "for read in iterreg:\n", " if read.qname not in rdict:\n", " rdict[read.qname] = read" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCCC'" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "read.seq\n" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 91)]" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "read.cigar" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['lane1_locus0_1B_0_0', 
'lane1_locus0_1B_0_1', 'lane1_locus0_1B_0_2', 'lane1_locus0_1B_0_3', 'lane1_locus0_1B_0_4', 'lane1_locus0_1B_0_5', 'lane1_locus0_1B_0_6', 'lane1_locus0_1B_0_7', 'lane1_locus0_1B_0_8', 'lane1_locus0_1B_0_9', 'lane1_locus0_1B_0_10', 'lane1_locus0_1B_0_11', 'lane1_locus0_1B_0_12', 'lane1_locus0_1B_0_13', 'lane1_locus0_1B_0_14', 'lane1_locus0_1B_0_15', 'lane1_locus0_1B_0_16', 'lane1_locus0_1B_0_17', 'lane1_locus0_1B_0_18', 'lane1_locus0_1B_0_19', 'lane1_locus0_1B_0_20'])" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(\n", " rdict.keys(),\n", " key=lambda x: int(x.split(\"\"))" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "it = samfile.pileup('MT', 5009, 5100)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "ename": "IndexError", "evalue": "list index out of range", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mchrom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mregion\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mclust\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfetch_cluster_se\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msamfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchrom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpos1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpos2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/refmap.py\u001b[0m in \u001b[0;36mfetch_cluster_se\u001b[0;34m(data, samfile, chrom, rstart, rend)\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[0;31m## sort dict keys so highest derep is first ('seed')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[0msfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";size=\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 370\u001b[0;31m \u001b[0mrkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreverse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 371\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 372\u001b[0m \u001b[0;31m## get blocks from the seed for filtering, bail out if seed is not paired\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/refmap.py\u001b[0m in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 368\u001b[0m 
def _derep_size(name):
    """
    Returns the dereplication count stored in a read name as ';size=N;'.
    Names that carry no ';size=' tag are counted as a single copy instead
    of raising IndexError.
    """
    _, tag, rest = name.partition(";size=")
    if not tag:
        return 1
    return int(rest.split(";")[0])


def _reference_span(read):
    """
    Returns (lowest, highest) reference position covered by the read, or
    None if the read reports no aligned position at all.
    """
    ## with full_length=True pysam reports None for read bases that have no
    ## reference position; min()/max() cannot compare None with int on py3.
    aligned = [
        pos for pos in read.get_reference_positions(full_length=True)
        if pos is not None
    ]
    if not aligned:
        return None
    return min(aligned), max(aligned)


def _oriented_seq(read):
    """
    Returns read.seq, passed through revcomp() when read.is_reverse is set.
    NOTE(review): revcomp is not defined in this block; it is expected to be
    provided by the surrounding module -- confirm it is in scope.
    """
    if read.is_reverse:
        return revcomp(read.seq)
    return read.seq


def fetch_cluster_se(data, samfile, chrom, rstart, rend):
    """
    Builds a single end cluster from the refmapped data.

    Parameters
    ----------
    data : object exposing `_hackersonly["min_SE_refmap_overlap"]`.
    samfile : object exposing `fetch(chrom, start, end)` that yields reads
        with `qname`, `seq`, `is_reverse` and `get_reference_positions()`.
    chrom : reference sequence name passed through to `samfile.fetch`.
    rstart, rend : int start and end of the region on `chrom`.

    Returns
    -------
    list of str
        One ">chrom:min:max;size=N;<mark>\\nSEQ" entry per unique read name.
        The read with the highest size comes first and is marked "*" (the
        seed); every other read is marked "+".
    str
        The empty string when no cluster can be built (no read in the
        region, or the seed has no aligned position). This is the same
        bail-out value the original error branch returned.
    """
    ## If SE then we enforce the minimum overlap distance to avoid the
    ## staircase syndrome of multiple reads overlapping just a little.
    overlap_buffer = data._hackersonly["min_SE_refmap_overlap"]

    ## only the coordinates handed to fetch() are buffered; the positions
    ## written into the cluster headers come from the reads themselves.
    rstart_buff = rstart + overlap_buffer
    rend_buff = rend - overlap_buffer

    ## Regions shorter than twice the buffer end up with start > end,
    ## so put the two values back in order.
    if rstart_buff > rend_buff:
        rstart_buff, rend_buff = rend_buff, rstart_buff
    ## a negative start is not a valid coordinate, clamp it at zero.
    rstart_buff = max(0, rstart_buff)
    ## Buffering can't make start and end equal or pysam returns nothing.
    if rstart_buff == rend_buff:
        rend_buff += 1

    ## keep the first read seen for every read name
    rdict = {}
    for read in samfile.fetch(chrom, rstart_buff, rend_buff):
        if read.qname not in rdict:
            rdict[read.qname] = read

    ## nothing mapped in this window: bail out before touching rkeys[0]
    if not rdict:
        return ""

    ## sort names so highest derep is first ('seed'); sorted() is stable so
    ## names with equal size keep the order in which fetch() returned them.
    rkeys = sorted(rdict, key=_derep_size, reverse=True)

    ## store the seed -------------------------------------------
    seed = rdict[rkeys[0]]
    span = _reference_span(seed)
    if span is None:
        return ""
    ## store, could write orient but just + for now.
    clust = [
        ">{}:{}:{};size={};*\n{}".format(
            chrom, span[0], span[1], _derep_size(rkeys[0]), _oriented_seq(seed))
    ]

    ## store the hits to the seed -------------------------------
    ## (slicing a one-element list is simply empty, no guard is needed)
    for key in rkeys[1:]:
        hit = rdict[key]
        span = _reference_span(hit)
        if span is None:
            ## no aligned position to report, leave this read out
            continue
        clust.append(
            ">{}:{}:{};size={};+\n{}".format(
                chrom, span[0], span[1], _derep_size(key), _oriented_seq(hit)))

    return clust
"code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "error1 = proc1.communicate()[0]" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "cmd2 = [\n", " ip.bins.samtools, \"view\", \n", " \"-b\", \n", " #\"-q\", \"30\",\n", " \"-F\", \"0x904\",\n", " \"-U\", os.path.join(\n", " data.dirs.refmapping, sample.name + \"-unmapped.bam\"), \n", " os.path.join(data.dirs.refmapping, sample.name + \".sam\")]\n" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)\n", "#proc2.communicate()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(b'', None)" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "proc3 = sps.Popen(cmd3, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)\n", "proc3.communicate()" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(b'', None)" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)\n", "proc4.communicate()" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ " sample.files.unmapped_reads = os.path.join(\n", " data.dirs.refmapping,\n", " \"{}-unmapped.fastq\".format(sample.name))\n", "\n", " \n", " cmd4 = [ip.bins.samtools, \"index\", sample.files.mapped_reads]\n", "\n", " # this is gonna read in the unmapped files, args are added below, \n", " # and it will output fastq formatted unmapped reads for merging.\n", " # -v 45 sets the default qscore arbitrarily high\n", " cmd5 = [\n", " ip.bins.samtools, \"bam2fq\",\n", " \"-v 45\",\n", " os.path.join(data.dirs.refmapping, sample.name + \"-unmapped.bam\")]\n", "\n", " # Insert additional arguments for paired data to the 
commands.\n", " # We assume Illumina paired end reads for the orientation \n", " # of mate pairs (orientation: ---> <----). \n", " if 'pair' in data.paramsdict[\"datatype\"]:\n", " # add samtools filter for only keep if both pairs hit\n", " # 0x1 - Read is paired\n", " # 0x2 - Each read properly aligned\n", " cmd2.insert(2, \"0x3\")\n", " cmd2.insert(2, \"-f\")\n", "\n", " # tell bam2fq that there are output files for each read pair\n", " cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + \"-tmp-umap1.fastq\"))\n", " cmd5.insert(2, \"-1\")\n", " cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + \"-tmp-umap2.fastq\"))\n", " cmd5.insert(2, \"-2\")\n", " else:\n", " cmd5.insert(2, sample.files.unmapped_reads)\n", " cmd5.insert(2, \"-0\")" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/home/deren/Documents/ipyrad/bin/samtools-linux-x86_64',\n", " 'bam2fq',\n", " '-0',\n", " '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-unmapped.fastq',\n", " '-v 45',\n", " '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-unmapped.bam']" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cmd5" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)\n", "error5 = proc5.communicate()[0]" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/home/deren/Documents/ipyrad/bin/samtools-linux-x86_64',\n", " 'sort',\n", " '-T',\n", " '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0.sam.tmp',\n", " '-O',\n", " 'bam',\n", " '-o',\n", " '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-mapped-sorted.bam']" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample.files.mapped_reads = os.path.join(\n", " 
data.dirs.refmapping,\n", " \"{}-mapped-sorted.bam\".format(sample.name))\n", "\n", "# this is gonna catch mapped bam output from cmd2 and write to file\n", "cmd3 = [\n", " ip.bins.samtools, \"sort\", \n", " \"-T\", os.path.join(data.dirs.refmapping, sample.name + \".sam.tmp\"),\n", " \"-O\", \"bam\", \n", " \"-o\", sample.files.mapped_reads]\n", "cmd3" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dirs.refmapping" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/home/deren/Documents/ipyrad/bin/bwa-linux-x86_64',\n", " 'mem',\n", " '-t',\n", " '2',\n", " '-M',\n", " '/home/deren/Documents/ipyrad/tests/ipsimdata/rad_example_genome.fa']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nthreads = 2\n", "\n", "\n", "cmd1 = [\n", " ip.bins.bwa, \"mem\",\n", " \"-t\", str(max(1, nthreads)),\n", " \"-M\",\n", " data.paramsdict['reference_sequence']\n", " ] \n", "\n", "cmd1\n", "#cmd1 += sample.files.dereps\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:01 | dereplicating | s3 |\n", "[####################] 100% 0:00:01 | clustering/mapping | s3 |\n", "[####################] 100% 0:00:00 | building clusters | s3 |\n", "[####################] 100% 0:00:00 | chunking clusters | s3 |\n", "[####################] 100% 0:00:15 | aligning clusters | s3 |\n", "[####################] 100% 0:00:00 | concat clusters | s3 |\n" ] } ], "source": [ "s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)\n", "sample = s3.samples[0]\n", "s3.run()" ] }, { "cell_type": "code", 
"execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#derep_sort_map(s3.data, sample, s3.force, s3.nthreads)\n", "sample.concatfiles = concat_multiple_edits(s3.data, sample)\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('/home/deren/Documents/ipyrad/tests/2-setest_edits/3K_0.trimmed_R1_.fastq.gz',\n", " 0)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample.mergedfile = merge_pairs(s3.data, sample, 1, 1) \n", "sample.mergedfile" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "expected str, bytes or os.PathLike object, not list", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnew_derep_and_sort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmergedfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m+\u001b[0m \u001b[0;34m\"_derep.fastq\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnthreads\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mnew_derep_and_sort\u001b[0;34m(data, infile, outfile, 
nthreads)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0;31m## build PIPEd job\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m \u001b[0mproc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msps\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcmd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msps\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSTDOUT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstdout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msps\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPIPE\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclose_fds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 430\u001b[0m \u001b[0merrmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/lib/python3.6/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)\u001b[0m\n\u001b[1;32m 707\u001b[0m \u001b[0mc2pread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc2pwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 708\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m 
710\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0;31m# Cleanup if the child failed starting.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/lib/python3.6/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1273\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1274\u001b[0m \u001b[0merrpipe_read\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrpipe_write\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1275\u001b[0;31m restore_signals, start_new_session, preexec_fn)\n\u001b[0m\u001b[1;32m 1276\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_child_created\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1277\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: expected str, bytes or os.PathLike object, not list" ] } ], "source": [ "new_derep_and_sort(s3.data, sample.mergedfile, os.path.join(s3.data.tmpdir, sample.name+ \"_derep.fastq\"), s3.nthreads)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "strand = \"plus\"\n", "if s3.data.paramsdict[\"datatype\"] is ('gbs' or '2brad'):\n", " strand = \"both\"\n", "\n", "\n", "cmd = [\n", " ip.bins.vsearch,\n", " \"--derep_fulllength\", sample.mergedfile,\n", " \"--strand\", strand,\n", " \"--output\", outfile,\n", " \"--threads\", str(nthreads),\n", " \"--fasta_width\", str(0),\n", " \"--fastq_qmax\", \"1000\",\n", " \"--sizeout\", \n", " \"--relabel_md5\",\n", " ]" ] 
}, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Assembly: pairtest\n", "[####################] 100% 0:00:00 | inferring [H, E] | s4 |\n", "\n", " Encountered an unexpected error (see ./ipyrad_log.txt)\n", " Error message is below -------------------------------\n", "The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()\n" ] } ], "source": [ "s3.data.run(\"4\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:01 | dereplicating | s3 |\n" ] } ], "source": [ "s3.remote_run_dereps()\n", "#s3.remote_run_cluster_map_build()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "for sample in s3.samples:\n", " cluster(s3.data, sample, s3.nthreads, s3.force)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "int() argument must be a string, a bytes-like object or a number, not 'list'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msample\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mbuild_clusters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaxindels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mbuild_clusters\u001b[0;34m(data, sample, maxindels)\u001b[0m\n\u001b[1;32m 872\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 873\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";size=\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 874\u001b[0;31m reverse=True) \n\u001b[0m\u001b[1;32m 875\u001b[0m \u001b[0mseqlist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfseqs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[0mseqsize\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 871\u001b[0m fseqs = [fseqs[0]] + sorted(fseqs[1:], \n\u001b[1;32m 872\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 873\u001b[0;31m 
\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";size=\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 874\u001b[0m reverse=True) \n\u001b[1;32m 875\u001b[0m \u001b[0mseqlist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfseqs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: int() argument must be a string, a bytes-like object or a number, not 'list'" ] } ], "source": [ "for sample in s3.samples:\n", " build_clusters(s3.data, sample, s3.maxindels)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "for sample in s3.samples:\n", " muscle_chunker(s3.data, sample)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "aasyncs = {}\n", "for sample in s3.samples:\n", " aasyncs[sample.name] = []\n", " for idx in range(10):\n", " handle = os.path.join(s3.data.tmpdir, \n", " \"{}_chunk_{}.ali\".format(sample.name, idx))\n", " align_and_parse(handle, s3.maxindels, s3.gbs)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "def _get_derep_num(read):\n", " \"return the number of replicates in a derep read\"\n", " return int(read.split(\"=\")[-1].split(\"\\n\")[0][:-1])\n", "\n", "\n", "\n", "\n", "def _aligned_indel_filter(clust, max_internal_indels):\n", " \"\"\" 
checks for too many internal indels in muscle aligned clusters \"\"\"\n", "\n", " ## make into list\n", " lclust = clust.split()\n", " \n", " ## paired or not\n", " try:\n", " seq1 = [i.split(\"nnnn\")[0] for i in lclust[1::2]]\n", " seq2 = [i.split(\"nnnn\")[1] for i in lclust[1::2]]\n", " intindels1 = [i.rstrip(\"-\").lstrip(\"-\").count(\"-\") for i in seq1]\n", " intindels2 = [i.rstrip(\"-\").lstrip(\"-\").count(\"-\") for i in seq2]\n", " intindels = intindels1 + intindels2\n", " if max(intindels) > max_internal_indels:\n", " return 1\n", " except IndexError:\n", " seq1 = lclust[1::2]\n", " intindels = [i.rstrip(\"-\").lstrip(\"-\").count(\"-\") for i in seq1]\n", " if max(intindels) > max_internal_indels:\n", " return 1 \n", " return 0\n" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "for sample in s3.samples:\n", " reconcat(s3.data, sample)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "\n", "def align_and_parse(handle, max_internal_indels=5, is_gbs=False):\n", " \"\"\" much faster implementation for aligning chunks \"\"\"\n", "\n", " ## data are already chunked, read in the whole thing. 
bail if no data.\n", " try:\n", " with open(handle, 'rb') as infile:\n", " clusts = infile.read().decode().split(\"//\\n//\\n\")\n", " # remove any empty spots\n", " clusts = [i for i in clusts if i]\n", " # Skip entirely empty chunks\n", " if not clusts:\n", " raise IPyradError(\"no clusters in file: {}\".format(handle))\n", "\n", " except (IOError, IPyradError):\n", " LOGGER.debug(\"skipping empty chunk - {}\".format(handle))\n", " return 0\n", "\n", " ## count discarded clusters for printing to stats later\n", " highindels = 0\n", "\n", " ## iterate over clusters sending each to muscle, splits and aligns pairs\n", " aligned = _persistent_popen_align3(clusts, 200, is_gbs)\n", "\n", " ## store good alignments to be written to file\n", " refined = []\n", "\n", " ## filter and trim alignments\n", " for clust in aligned:\n", " # check for too many internal indels\n", " if not _aligned_indel_filter(clust, max_internal_indels):\n", " refined.append(clust)\n", " else:\n", " highindels += 1\n", "\n", " ## write to file after\n", " if refined:\n", " outhandle = handle.rsplit(\".\", 1)[0] + \".aligned\"\n", " with open(outhandle, 'wb') as outfile:\n", " outfile.write(str.encode(\"\\n//\\n//\\n\".join(refined) + \"\\n\"))\n", "\n", " ## remove the old tmp file\n", " if not LOGGER.getEffectiveLevel() == 10:\n", " os.remove(handle)\n", " return highindels" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def _persistent_popen_align3(clusts, maxseqs=200, is_gbs=False):\n", " \"\"\" keeps a persistent bash shell open and feeds it muscle alignments \"\"\"\n", "\n", " ## create a separate shell for running muscle in, this is much faster\n", " ## than spawning a separate subprocess for each muscle call\n", " proc = sps.Popen(\n", " [\"bash\"], \n", " stdin=sps.PIPE, \n", " stdout=sps.PIPE, \n", " bufsize=0,\n", " )\n", "\n", " ## iterate over clusters in this file until finished\n", " aligned = []\n", " for clust in clusts:\n", "\n", " 
## new alignment string for read1s and read2s\n", " align1 = []\n", " align2 = []\n", "\n", " ## don't bother aligning if only one seq\n", " if clust.count(\">\") == 1:\n", " aligned.append(clust.replace(\">\", \"\").strip())\n", " else:\n", "\n", " # do we need to split the alignment? (is there a PE insert?)\n", " try:\n", " # make into list (only read maxseqs lines, 2X cuz names)\n", " lclust = clust.split()[:maxseqs * 2]\n", "\n", " # try to split cluster list at nnnn separator for each read\n", " lclust1 = list(chain(*zip(\n", " lclust[::2], [i.split(\"nnnn\")[0] for i in lclust[1::2]])))\n", " lclust2 = list(chain(*zip(\n", " lclust[::2], [i.split(\"nnnn\")[1] for i in lclust[1::2]])))\n", "\n", " # put back into strings\n", " clust1 = \"\\n\".join(lclust1)\n", " clust2 = \"\\n\".join(lclust2)\n", "\n", " # Align the first reads.\n", " # The muscle command with alignment as stdin and // as split\n", " cmd1 = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n", " .format(clust1, ip.bins.muscle, \"//\\n\"))\n", "\n", " # send cmd1 to the bash shell\n", " proc.stdin.write(cmd1.encode())\n", "\n", " # read the stdout by line until splitter is reached\n", " # meaning that the alignment is finished.\n", " for line in iter(proc.stdout.readline, b'//\\n'):\n", " align1.append(line.decode())\n", "\n", " # Align the second reads.\n", " # The muscle command with alignment as stdin and // as split\n", " cmd2 = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n", " .format(clust2, ip.bins.muscle, \"//\\n\"))\n", "\n", " # send cmd2 to the bash shell\n", " proc.stdin.write(cmd2.encode())\n", "\n", " # read the stdout by line until splitter is reached\n", " # meaning that the alignment is finished.\n", " for line in iter(proc.stdout.readline, b'//\\n'):\n", " align2.append(line.decode())\n", "\n", " # join up aligned read1 and read2 and ensure names order match\n", " lines1 = \"\".join(align1)[1:].split(\"\\n>\")\n", " lines2 = \"\".join(align2)[1:].split(\"\\n>\")\n", " dalign1 = 
dict([i.split(\"\\n\", 1) for i in lines1])\n", " dalign2 = dict([i.split(\"\\n\", 1) for i in lines2])\n", "\n", " # sort the first reads\n", " keys = list(dalign1.keys())\n", " seed = [i for i in keys if i[-1] == \"*\"][0]\n", " keys.pop(keys.index(seed))\n", " order = [seed] + sorted(\n", " keys, key=_get_derep_num, reverse=True) \n", "\n", " # combine in order\n", " alignpe = [] \n", " for key in order:\n", " alignpe.append(\"\\n\".join([\n", " key, \n", " dalign1[key].replace(\"\\n\", \"\") + \"nnnn\" + \\\n", " dalign2[key].replace(\"\\n\", \"\")]))\n", "\n", " ## append aligned cluster string\n", " aligned.append(\"\\n\".join(alignpe).strip())\n", "\n", " # Malformed clust. Dictionary creation with only 1 element \n", " except ValueError as inst:\n", " ip.logger.debug(\n", " \"Bad PE cluster - {}\\nla1 - {}\\nla2 - {}\"\n", " .format(clust, lines1, lines2))\n", "\n", " ## Either reads are SE, or at least some pairs are merged.\n", " except IndexError:\n", " \n", " # limit the number of input seqs\n", " # use lclust already built before checking pairs\n", " lclust = \"\\n\".join(clust.split()[:maxseqs * 2])\n", "\n", " # the muscle command with alignment as stdin and // as splitter\n", " cmd = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n", " .format(lclust, ip.bins.muscle, \"//\\n\"))\n", "\n", " ## send cmd to the bash shell (TODO: PIPE could overflow here!)\n", " proc.stdin.write(cmd.encode())\n", "\n", " ## read the stdout by line until // is reached. 
This BLOCKS.\n", " for line in iter(proc.stdout.readline, b'//\\n'):\n", " align1.append(line.decode())\n", "\n", " ## remove '>' from names, and '\\n' from inside long seqs \n", " lines = \"\".join(align1)[1:].split(\"\\n>\")\n", "\n", " ## find seed of the cluster and put it on top.\n", " seed = [i for i in lines if i.split(\";\")[-1][0] == \"*\"][0]\n", " lines.pop(lines.index(seed))\n", " lines = [seed] + sorted(\n", " lines, key=_get_derep_num, reverse=True)\n", "\n", " ## format remove extra newlines from muscle\n", " aa = [i.split(\"\\n\", 1) for i in lines]\n", " align1 = [i[0] + '\\n' + \"\".join([j.replace(\"\\n\", \"\") \n", " for j in i[1:]]) for i in aa]\n", " \n", " # trim edges in sloppy gbs/ezrad data. \n", " # Maybe relevant to other types too...\n", " if is_gbs:\n", " align1 = _gbs_trim(align1)\n", "\n", " ## append to aligned\n", " aligned.append(\"\\n\".join(align1))\n", " \n", " # cleanup\n", " proc.stdout.close()\n", " if proc.stderr:\n", " proc.stderr.close()\n", " proc.stdin.close()\n", " proc.wait()\n", "\n", " ## return the aligned clusters\n", " return aligned " ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msample\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m 
\u001b[0mip\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massemble\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclustmap\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sample_cleanup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m_sample_cleanup\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 1262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1263\u001b[0m \u001b[0;31m# get maxlen and depths array from clusters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1264\u001b[0;31m \u001b[0mmaxlens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdepths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_quick_depths\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1265\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1266\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m_get_quick_depths\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 1250\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1251\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1252\u001b[0;31m \u001b[0mtdepth\u001b[0m \u001b[0;34m+=\u001b[0m 
\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1253\u001b[0m \u001b[0mtlen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'" ] } ], "source": [ "for sample in s3.samples:\n", " ip.assemble.clustmap._sample_cleanup(s3.data, sample)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mip\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massemble\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclustmap\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_quick_depths\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m_get_quick_depths\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 1250\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1251\u001b[0m 
# Walk the sample's gzipped cluster file two lines at a time and record, for
# every cluster, its summed depth and the length of the last sequence line
# read. A pair whose two lines are identical marks the end of a cluster.
# NOTE(review): `data` and `sample` must already exist in the kernel.
fclust = data.samples[sample.name].files.clusters

## storage
depths = []
maxlen = []

## start with cluster 0
tdepth = 0
tlen = 0

# the context manager closes the gzip handle, which was previously left open
with gzip.open(fclust, 'rt') as clusters:
    # builtin zip is lazy on Python 3, where itertools has no izip
    pairdealer = zip(*[iter(clusters)] * 2)

    ## iterate until empty
    for name, seq in pairdealer:

        ## end of a cluster: store the totals and reset
        if name.strip() == seq.strip():
            depths.append(tdepth)
            maxlen.append(tlen)
            tlen = 0
            tdepth = 0

        else:
            # size is the integer after the last '='; strip the optional ';'
            # and the trailing marker so both "size=3*" and "size=3;*" parse
            tdepth += int(name.strip().split("=")[-1].rstrip(";*+-"))
            # length of the raw line, trailing newline included
            tlen = len(seq)
"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'1D_0'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample.name\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:02 | dereplicating | s3 |\n" ] } ], "source": [ "s3.remote_run_dereps()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:05 | clustering/mapping | s3 |\n", "[####################] 100% 0:00:00 | building clusters | s3 |\n", "[####################] 100% 0:00:00 | chunking clusters | s3 |\n" ] } ], "source": [ "s3.remote_run_cluster_map_build()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:02 | aligning clusters | s3 |\n" ] }, { "ename": "IPyradError", "evalue": "TypeError(a bytes-like object is required, not 'str')", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote_run_align_cleanup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mremote_run_align_cleanup\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mjob\u001b[0m \u001b[0;32min\u001b[0m 
# Scratch reproduction of the chunk-alignment step on a single chunk file.
# NOTE(review): relies on `sps`, `chain`, `ip`, `IPyradError`, `_gbs_trim`
# and `_get_derep_num` already being defined in the kernel.

# --- load the unaligned clusters of one chunk ---
handle = "pairtest-tmpalign/1A_0_chunk_0.ali"
with open(handle, 'rb') as infile:
    clusts = infile.read().decode().split("//\n//\n")
    # remove any empty spots
    clusts = [i for i in clusts if i]
    # Skip entirely empty chunks
    if not clusts:
        raise IPyradError("no clusters in file: {}".format(handle))

# --- alignment settings ---
maxseqs = 200
is_gbs = False

# --- align every cluster through one persistent bash shell ---
proc = sps.Popen(
    ["bash"],
    stdin=sps.PIPE,
    stdout=sps.PIPE,
    bufsize=0,
)

## iterate over clusters in this file until finished
aligned = []
for clust in clusts:

    ## new alignment string for read1s and read2s
    align1 = []
    align2 = []

    ## don't bother aligning if only one seq
    if clust.count(">") == 1:
        aligned.append(clust.replace(">", "").strip())
    else:

        # do we need to split the alignment? (is there a PE insert?)
        try:
            # make into list (only read maxseqs lines, 2X cuz names)
            lclust = clust.split()[:maxseqs * 2]

            # try to split cluster list at nnnn separator for each read
            lclust1 = list(chain(*zip(
                lclust[::2], [i.split("nnnn")[0] for i in lclust[1::2]])))
            lclust2 = list(chain(*zip(
                lclust[::2], [i.split("nnnn")[1] for i in lclust[1::2]])))

            # put back into strings
            clust1 = "\n".join(lclust1)
            clust2 = "\n".join(lclust2)

            # Align the first reads.
            # The muscle command with alignment as stdin and // as split
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(clust1, ip.bins.muscle, "//\n"))

            # send cmd1 to the bash shell
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached.
            # The pipe yields bytes, so the sentinel must be bytes too: a str
            # sentinel never compares equal and the loop would never stop.
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            # Align the second reads.
            # The muscle command with alignment as stdin and // as split
            cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(clust2, ip.bins.muscle, "//\n"))

            # send cmd2 to the bash shell
            proc.stdin.write(cmd2.encode())

            # read the stdout by line until splitter is reached
            # meaning that the alignment is finished.
            for line in iter(proc.stdout.readline, b'//\n'):
                # decode so the str join below works
                align2.append(line.decode())

            # join up aligned read1 and read2 and ensure names order match
            lines1 = "".join(align1)[1:].split("\n>")
            lines2 = "".join(align2)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines1])
            dalign2 = dict([i.split("\n", 1) for i in lines2])

            # sort the first reads
            keys = list(dalign1.keys())
            seed = [i for i in keys if i[-1] == "*"][0]
            keys.pop(keys.index(seed))
            order = [seed] + sorted(
                keys, key=_get_derep_num, reverse=True)

            # combine in order, in a separate list: appending to align1 mixed
            # the merged records with the raw muscle output it still holds
            alignpe = []
            for key in order:
                alignpe.append("\n".join([
                    key,
                    dalign1[key].replace("\n", "") + "nnnn" +
                    dalign2[key].replace("\n", "")]))

            ## append aligned cluster string
            aligned.append("\n".join(alignpe).strip())

        # Malformed clust. Dictionary creation with only 1 element
        except ValueError as inst:
            ip.logger.debug(
                "Bad PE cluster - {}\nla1 - {}\nla2 - {}"
                .format(clust, lines1, lines2))

        ## Either reads are SE, or at least some pairs are merged.
        except IndexError:

            # limit the number of input seqs
            # use lclust already built before checking pairs
            lclust = "\n".join(clust.split()[:maxseqs * 2])

            # the muscle command with alignment as stdin and // as splitter
            cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                   .format(lclust, ip.bins.muscle, "//\n"))

            ## send cmd to the bash shell (TODO: PIPE could overflow here!)
            proc.stdin.write(cmd.encode())

            ## read the stdout by line until // is reached. This BLOCKS.
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            ## remove '>' from names, and '\n' from inside long seqs
            lines = "".join(align1)[1:].split("\n>")

            ## find seed of the cluster and put it on top.
            seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
            lines.pop(lines.index(seed))
            lines = [seed] + sorted(
                lines, key=_get_derep_num, reverse=True)

            ## format remove extra newlines from muscle
            aa = [i.split("\n", 1) for i in lines]
            align1 = [i[0] + '\n' + "".join([j.replace("\n", "")
                      for j in i[1:]]) for i in aa]

            # trim edges in sloppy gbs/ezrad data.
            # Maybe relevant to other types too...
            if is_gbs:
                align1 = _gbs_trim(align1)

            ## append to aligned
            aligned.append("\n".join(align1))

# cleanup
proc.stdout.close()
if proc.stderr:
    proc.stderr.close()
proc.stdin.close()
proc.wait()

## return the aligned clusters
#return aligned

# --- from here on the library's size parser replaces any local definition ---
from ipyrad.assemble.clustmap import _get_derep_num

# --- re-derive the read order for the last cluster processed above ---
# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])

# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
    keys, key=_get_derep_num, reverse=True)
# Scratch cells that rebuild one paired-end cluster from the raw muscle
# output left in `align1` / `align2` by the alignment loop.
# NOTE(review): relies on `align1`, `align2`, `aligned`, `lines` and
# `_get_derep_num` already existing in the kernel.

# --- merge read1 and read2 alignments of the current cluster ---
# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])

# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
    keys, key=_get_derep_num, reverse=True)

# combine in order. The merged records go into their own list: appending
# them to align1 put them after the raw muscle lines, so the stored cluster
# started with the unmerged ">name" output.
alignpe = []
for key in order:
    alignpe.append("\n".join([
        key,
        dalign1[key].replace("\n", "") + "nnnn" +
        dalign2[key].replace("\n", "")]))

## append aligned cluster string
aligned.append("\n".join(alignpe).strip())

# --- inspect the result ---
aligned

# --- alternative: order the read1 records before building the dict ---
## remove '>' from names, and '\n' from inside long seqs
lines1 = "".join(align1)[1:].split("\n>")
seed = [i for i in lines1 if i.split(";")[-1][0] == "*"][0]
lines1.pop(lines1.index(seed))
lines1 = [seed] + sorted(
    lines1, key=_get_derep_num, reverse=True)
dalign1 = dict([i.split("\n", 1) for i in lines1])


lines2 = "".join(align2)[1:].split("\n>")
dalign2 = dict([i.split("\n", 1) for i in lines2])
#seed = [i for i in lines2 if i.split(";")[-1][0] == "*"][0]
#lines2.pop(lines2.index(seed))

# --- single-end formatting: one name line plus one unwrapped sequence line ---
## format remove extra newlines from muscle
aa = [i.split("\n", 1) for i in lines]
align1 = [i[0] + '\n' + "".join([j.replace("\n", "")
          for j in i[1:]]) for i in aa]

align1
"execution_count": 16, "metadata": {}, "outputs": [], "source": [ "derephandle = os.path.join(data.tmpdir, sample.name + \"_derep.fastq\")\n", "uhandle = os.path.join(data.dirs.clusts, sample.name + \".utemp\")\n", "temphandle = os.path.join(data.dirs.clusts, sample.name + \".htemp\")\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "derepfile = os.path.join(data.tmpdir, sample.name + \"_derep.fastq\")\n", "uhandle = os.path.join(data.dirs.clusts, sample.name + \".utemp\")\n", "usort = os.path.join(data.dirs.clusts, sample.name + \".utemp.sort\")\n", "hhandle = os.path.join(data.dirs.clusts, sample.name + \".htemp\")\n", "sample.files.clusters = os.path.join(\n", " data.dirs.clusts, sample.name + \".clust.gz\")\n", "clustsout = gzip.open(sample.files.clusters, 'wt')\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "cmd = [\"sort\", \"-k\", \"2\", uhandle, \"-o\", usort]\n", "proc = sps.Popen(cmd, close_fds=True)\n", "proc.communicate()[0]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "alldereps = {}\n", "with open(derepfile, 'rt') as ioderep:\n", " dereps = izip(*[iter(ioderep)] * 2)\n", " for namestr, seq in dereps:\n", " nnn, sss = [i.strip() for i in (namestr, seq)] \n", " alldereps[nnn[1:]] = sss" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "invalid literal for int() with base 10: '1+\\nTGCAGGTCACTTTTCAAGATACACTATTGTTATTACTGTGAGACACAAAGCTAATTCATCACTTCACGGATACCGCGTCCTCCTATAACGCnnnnCAATATTAACGCGTGAGTACCGGTTTCCTTGTGAGGAAGGCCCACTCTCAGTACCACCCTTATCCTATTCTAAGGCACACATGCATAGACCACTCAACCG", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 25\u001b[0m fseqs = [fseqs[0]] + sorted(fseqs[1:], \n\u001b[1;32m 26\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m int(x.split(\";size=\")[1].split(\";\")[0]), reverse=True) \n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0mseqlist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfseqs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mseqsize\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 25\u001b[0m fseqs = [fseqs[0]] + sorted(fseqs[1:], \n\u001b[1;32m 26\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m int(x.split(\";size=\")[1].split(\";\")[0]), reverse=True) \n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0mseqlist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfseqs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mseqsize\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: '1+\\nTGCAGGTCACTTTTCAAGATACACTATTGTTATTACTGTGAGACACAAAGCTAATTCATCACTTCACGGATACCGCGTCCTCCTATAACGCnnnnCAATATTAACGCGTGAGTACCGGTTTCCTTGTGAGGAAGGCCCACTCTCAGTACCACCCTTATCCTATTCTAAGGCACACATGCATAGACCACTCAACCG" ] } ], "source": [ "seedsseen = set()\n", "maxindels = 8\n", 
"\n", "## Iterate through the usort file grabbing matches to build clusters\n", "with open(usort, 'rt') as insort:\n", " ## iterator, seed null, seqlist null\n", " isort = iter(insort)\n", " lastseed = 0\n", " fseqs = []\n", " seqlist = []\n", " seqsize = 0\n", " while 1:\n", " ## grab the next line\n", " try:\n", " hit, seed, _, ind, ori, _ = next(isort).strip().split()\n", " except StopIteration:\n", " break\n", "\n", " ## same seed, append match\n", " if seed != lastseed:\n", " seedsseen.add(seed)\n", " ## store the last cluster (fseq), count it, and clear fseq\n", " if fseqs:\n", " ## sort fseqs by derep after pulling out the seed\n", " fseqs = [fseqs[0]] + sorted(fseqs[1:], \n", " key=lambda x: \n", " int(x.split(\";size=\")[1].split(\";\")[0]), reverse=True) \n", " seqlist.append(\"\\n\".join(fseqs))\n", " seqsize += 1\n", " fseqs = []\n", "\n", " # occasionally write/dump stored clusters to file and clear mem\n", " if not seqsize % 10000:\n", " if seqlist:\n", " clustsout.write(\n", " \"\\n//\\n//\\n\".join(seqlist) + \"\\n//\\n//\\n\")\n", " ## reset list and counter\n", " seqlist = []\n", "\n", " ## store the new seed on top of fseq list\n", " fseqs.append(\">{}*\\n{}\".format(seed, alldereps[seed]))\n", " lastseed = seed\n", "\n", " ## add match to the seed\n", " ## revcomp if orientation is reversed (comp preserves nnnn)\n", " if ori == \"-\":\n", " seq = comp(alldereps[hit])[::-1]\n", " else:\n", " seq = alldereps[hit]\n", " ## only save if not too many indels\n", " if int(ind) <= maxindels:\n", " fseqs.append(\">{}{}\\n{}\".format(hit, ori, seq))\n", " else:\n", " ip.logger.info(\"filtered by maxindels: %s %s\", ind, seq)\n", "\n", "## write whatever is left over to the clusts file\n", "if fseqs:\n", " seqlist.append(\"\\n\".join(fseqs))\n", "if seqlist:\n", " clustsout.write(\"\\n//\\n//\\n\".join(seqlist) + \"\\n//\\n//\\n\")\n", "\n", "## now write the seeds that had no hits. 
Make dict from htemp\n", "with open(hhandle, 'rt') as iotemp:\n", " nohits = izip(*[iter(iotemp)] * 2)\n", " seqlist = []\n", " seqsize = 0\n", " while 1:\n", " try:\n", " nnn, _ = [i.strip() for i in next(nohits)]\n", " except StopIteration:\n", " break\n", "\n", " ## occasionally write to file\n", " if not seqsize % 10000:\n", " if seqlist:\n", " clustsout.write(\"\\n//\\n//\\n\".join(seqlist) + \"\\n//\\n//\\n\")\n", " ## reset list and counter\n", " seqlist = []\n", "\n", " ## append to list if new seed\n", " if nnn[1:] not in seedsseen:\n", " seqlist.append(\"{}*\\n{}\".format(nnn, alldereps[nnn[1:]]))\n", " seqsize += 1\n", "\n", "## write whatever is left over to the clusts file\n", "if seqlist:\n", " clustsout.write(\"\\n//\\n//\\n\".join(seqlist))\n", "\n", "## close the file handle\n", "clustsout.close()\n", "del alldereps" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'00519cc27c6ab8f71dcdef028ad184e8;size=12',\n", " '00a2c3fa80127d8adfc783464de05df4;size=10'}" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seedsseen" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "sample.concatfiles = concat_multiple_edits(data, sample)\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "sample.mergedfile = merge_pairs(data, sample, 1, 1) \n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "new_derep_and_sort(\n", " data,\n", " sample.mergedfile,\n", " os.path.join(data.tmpdir, sample.name + \"_derep.fastq\"),\n", " 2)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "data = data\n", "sample = sample\n", "revcomp = 1\n", "vsearch_merge = 1\n", "\n", "sample.concatfiles = concat_multiple_edits(data, sample)\n", "sample.mergefile = os.path.join(data.tmpdir, sample.name + 
\"_merged_.fastq\") " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "sample.mergefile = os.path.join(data.tmpdir, sample.name + \"_merged_.fastq\") \n", "if 'pair' in data.paramsdict['datatype']:\n", " if \"reference\" not in data.paramsdict[\"assembly_method\"]:\n", " nmerged = ip.assemble.clustmap._merge_pairs(data, sample, 1, 1)\n", " else:\n", " nmerged = 0 # _merge_pairs(data, sample, 0, 0) \n", " sample.stats.reads_merged = nmerged" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "new_derep_and_sort(data, sampl)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def new_derep_and_sort(data, infile, outfile, nthreads):\n", " \"\"\"\n", " Dereplicates reads and sorts so reads that were highly replicated are at\n", " the top, and singletons at bottom, writes output to derep file. Paired\n", " reads are dereplicated as one concatenated read and later split again.\n", " Updated this function to take infile and outfile to support the double\n", " dereplication that we need for 3rad (5/29/15 iao).\n", " \"\"\"\n", " ## datatypes options\n", " strand = \"plus\"\n", " if data.paramsdict[\"datatype\"] is ('gbs' or '2brad'):\n", " strand = \"both\"\n", "\n", " ## do dereplication with vsearch\n", " cmd = [\n", " ip.bins.vsearch,\n", " \"--derep_fulllength\", infile,\n", " \"--strand\", strand,\n", " \"--output\", outfile,\n", " \"--threads\", str(nthreads),\n", " \"--fasta_width\", str(0),\n", " \"--fastq_qmax\", \"1000\",\n", " \"--sizeout\", \n", " \"--relabel_md5\",\n", " ]\n", " ip.logger.info(\"derep cmd %s\", cmd)\n", "\n", " ## build PIPEd job\n", " proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)\n", " errmsg = proc.communicate()[0]\n", " if proc.returncode:\n", " ip.logger.error(\"error inside derep_and_sort %s\", errmsg)\n", " raise IPyradWarningExit(errmsg)" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "new_derep_and_sort()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "## CONCAT FILES FOR MERGED ASSEMBLIES\n", "mergefile = os.path.join(data.tmpdir, sample.name + \"_merged_.fastq\")\n", "sample.files.edits = concat_multiple_edits(data, sample)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "ename": "IndexError", "evalue": "tuple index out of range", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfiles\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmergefile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m nmerged = ip.assemble.clustmap._merge_pairs(\n\u001b[0;32m----> 3\u001b[0;31m data, sample.files.edits, mergefile, 1, 1)\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreads_merged\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnmerged\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m_merge_pairs\u001b[0;34m(data, two_files, merged_out, revcomp, merge)\u001b[0m\n\u001b[1;32m 693\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 694\u001b[0m \u001b[0mtmp1\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtwo_files\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 695\u001b[0;31m \u001b[0mtmp2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtwo_files\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 697\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIndexError\u001b[0m: tuple index out of range" ] } ], "source": [ "sample.files.edits = [(mergefile, )]\n", "nmerged = ip.assemble.clustmap._merge_pairs(\n", " data, sample.files.edits, mergefile, 1, 1)\n", "sample.stats.reads_merged = nmerged" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/home/deren/Documents/ipyrad/pairtest-tmpalign/2E_0_merged_.fastq'" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mergefile" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('/home/deren/Documents/ipyrad/pairtest-tmpalign/2E_0_merged_.fastq',)]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample.files.edits" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[####################] 100% 0:00:00 | dereplicating | s3 |\n", "[####################] 100% 0:00:01 | clustering/mapping | s3 |\n", "[####################] 100% 0:00:00 | building clusters | s3 |\n", "[####################] 100% 0:00:00 | chunking clusters | s3 |\n", "[####################] 100% 0:00:15 | aligning clusters | s3 |\n", "[####################] 100% 0:00:00 | concat clusters | 
s3 |\n" ] } ], "source": [ "s3.run()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# muscle align returns values for bad alignments\n", "ip.assemble.cluster_within.sample_cleanup(data, samples[0])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clusters_totalhidepth_minclusters_hidepthavg_depth_totalavg_depth_mjavg_depth_statsd_depth_totalsd_depth_mjsd_depth_statfiltered_bad_align
1A_01000.06.01000.019.86219.86219.8622.8307172.8307172.8307170.0
1B_01000.06.01000.020.04320.04320.0432.8073392.8073392.8073390.0
1C_01000.06.01000.020.13620.13620.1362.8742832.8742832.8742830.0
1D_01000.06.01000.019.96619.96619.9662.7384022.7384022.7384020.0
2E_01000.06.01000.020.01720.01720.0172.7789772.7789772.7789770.0
2F_01000.06.01000.019.93319.93319.9332.8334632.8334632.8334630.0
2G_01000.06.01000.020.03020.03020.0302.7736442.7736442.7736440.0
2H_01000.06.01000.020.19920.19920.1992.8700872.8700872.8700870.0
3I_01000.06.01000.019.88519.88519.8853.0126033.0126033.0126030.0
3J_01000.06.01000.019.82219.82219.8222.8785962.8785962.8785960.0
3K_01000.06.01000.019.96519.96519.9652.8857882.8857882.8857880.0
3L_01000.06.01000.020.00820.00820.0082.9048132.9048132.9048130.0
\n", "
" ], "text/plain": [ " clusters_total hidepth_min clusters_hidepth avg_depth_total \\\n", "1A_0 1000.0 6.0 1000.0 19.862 \n", "1B_0 1000.0 6.0 1000.0 20.043 \n", "1C_0 1000.0 6.0 1000.0 20.136 \n", "1D_0 1000.0 6.0 1000.0 19.966 \n", "2E_0 1000.0 6.0 1000.0 20.017 \n", "2F_0 1000.0 6.0 1000.0 19.933 \n", "2G_0 1000.0 6.0 1000.0 20.030 \n", "2H_0 1000.0 6.0 1000.0 20.199 \n", "3I_0 1000.0 6.0 1000.0 19.885 \n", "3J_0 1000.0 6.0 1000.0 19.822 \n", "3K_0 1000.0 6.0 1000.0 19.965 \n", "3L_0 1000.0 6.0 1000.0 20.008 \n", "\n", " avg_depth_mj avg_depth_stat sd_depth_total sd_depth_mj \\\n", "1A_0 19.862 19.862 2.830717 2.830717 \n", "1B_0 20.043 20.043 2.807339 2.807339 \n", "1C_0 20.136 20.136 2.874283 2.874283 \n", "1D_0 19.966 19.966 2.738402 2.738402 \n", "2E_0 20.017 20.017 2.778977 2.778977 \n", "2F_0 19.933 19.933 2.833463 2.833463 \n", "2G_0 20.030 20.030 2.773644 2.773644 \n", "2H_0 20.199 20.199 2.870087 2.870087 \n", "3I_0 19.885 19.885 3.012603 3.012603 \n", "3J_0 19.822 19.822 2.878596 2.878596 \n", "3K_0 19.965 19.965 2.885788 2.885788 \n", "3L_0 20.008 20.008 2.904813 2.904813 \n", "\n", " sd_depth_stat filtered_bad_align \n", "1A_0 2.830717 0.0 \n", "1B_0 2.807339 0.0 \n", "1C_0 2.874283 0.0 \n", "1D_0 2.738402 0.0 \n", "2E_0 2.778977 0.0 \n", "2F_0 2.833463 0.0 \n", "2G_0 2.773644 0.0 \n", "2H_0 2.870087 0.0 \n", "3I_0 3.012603 0.0 \n", "3J_0 2.878596 0.0 \n", "3K_0 2.885788 0.0 \n", "3L_0 2.904813 0.0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data._build_stat(\"s3\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "ename": "IPyradError", "evalue": "hi", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0;32mraise\u001b[0m \u001b[0mIPyradError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"hi\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIPyradError\u001b[0m: hi" ] } ], "source": [ "raise IPyradError(\"hi\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "derep_sort_map() missing 1 required positional argument: 'nthreads'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mself\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mderep_sort_map\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msamples\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mTypeError\u001b[0m: derep_sort_map() missing 1 required positional argument: 'nthreads'" ] } ], "source": [ "self = data\n", "derep_sort_map(s3.data, samples[0], s3.force)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ra.exception()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data.get_params()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": 
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }