{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import ipyrad as ip\n",
"import ipyparallel as ipp"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: \n",
" Controller appears to be listening on localhost, but not on this machine.\n",
" If this is true, you should specify Client(...,sshserver='you@oud')\n",
" or instruct your controller to listen on an external IP.\n",
" RuntimeWarning)\n"
]
}
],
"source": [
"ipyclient = ipp.Client()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"New Assembly: 1-pairtest\n"
]
}
],
"source": [
"data = ip.Assembly(\"1-pairtest\")\n",
"data.set_params(\"raw_fastq_path\", \"ipsimdata/pairddrad_example_R*_.fastq.gz\")\n",
"data.set_params(\"barcodes_path\", \"ipsimdata/pairddrad_example_barcodes.txt\")\n",
"data.set_params(\"datatype\", \"pairddrad\")\n",
"data.set_params(\"restriction_overhang\", (\"TGCAG\", \"CGG\"))\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"New Assembly: 2-setest\n"
]
}
],
"source": [
"data = ip.Assembly(\"2-setest\")\n",
"data.set_params(\"raw_fastq_path\", \"ipsimdata/rad_example_R1_.fastq.gz\")\n",
"data.set_params(\"barcodes_path\", \"ipsimdata/rad_example_barcodes.txt\")\n",
"data.set_params(\"datatype\", \"rad\")\n",
"data.set_params(\"restriction_overhang\", (\"TGCAG\", \"\"))\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"New Assembly: 3-refsetest\n"
]
}
],
"source": [
"data = ip.Assembly(\"3-refsetest\")\n",
"data.set_params(\"raw_fastq_path\", \"ipsimdata/rad_example_R1_.fastq.gz\")\n",
"data.set_params(\"barcodes_path\", \"ipsimdata/rad_example_barcodes.txt\")\n",
"data.set_params(\"datatype\", \"rad\")\n",
"data.set_params(\"assembly_method\", \"reference\")\n",
"data.set_params(\"reference_sequence\", \"ipsimdata/rad_example_genome.fa\")\n",
"data.set_params(\"restriction_overhang\", (\"TGCAG\", \"\"))\n",
"#data.get_params()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"New Assembly: 4-refpairtest\n"
]
}
],
"source": [
"# paired ddRAD reads assembled against a reference; the fastq, barcodes\n",
"# and genome all come from the same pairddrad_example_* dataset\n",
"data = ip.Assembly(\"4-refpairtest\")\n",
"data.set_params(\"raw_fastq_path\", \"ipsimdata/pairddrad_example_R*_.fastq.gz\")\n",
"data.set_params(\"barcodes_path\", \"ipsimdata/pairddrad_example_barcodes.txt\")\n",
"data.set_params(\"datatype\", \"pairddrad\")\n",
"data.set_params(\"assembly_method\", \"reference\")\n",
"data.set_params(\"reference_sequence\", \"ipsimdata/pairddrad_example_genome.fa\")\n",
"data.set_params(\"restriction_overhang\", (\"TGCAG\", \"CGG\"))\n",
"#data.get_params()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"New Assembly: 5-tortas\n"
]
}
],
"source": [
"data = ip.Assembly(\"5-tortas\")\n",
"data.set_params(\"project_dir\", \"tortas\")\n",
"data.set_params(\"sorted_fastq_path\", \"/home/deren/Dropbox/Maud/fastq-concats/*.gz\")\n",
"data.set_params(\"datatype\", \"pairddrad\")\n",
"data.set_params(\"assembly_method\", \"reference\")\n",
"data.set_params(\"reference_sequence\", \"/home/deren/Dropbox/Maud/lgeorge.genome.fa\")\n",
"data.set_params(\"restriction_overhang\", (\"CATG\", \"AATT\"))\n",
"data.set_params(\"filter_adapters\", 2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Assembly: 4-refpairtest\n",
"[force] overwriting fastq files previously created by ipyrad.\n",
"This _does not_ affect your original/raw data files.\n",
"[####################] 100% 0:00:04 | sorting reads | s1 |\n",
"[####################] 100% 0:00:02 | writing/compressing | s1 |\n",
"[####################] 100% 0:00:04 | processing reads | s2 |\n"
]
}
],
"source": [
"data.run(\"12\", force=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading Assembly: 5-tortas\n",
"from saved path: ~/Documents/ipyrad/tests/tortas/5-tortas.json\n"
]
}
],
"source": [
"#data.run(\"12\", force=True)\n",
"#data = ip.load_json(\"1-pairtest.json\")\n",
"#data = ip.load_json(\"2-setest.json\")\n",
"#data = ip.load_json(\"3-refsetest.json\")\n",
"#data = ip.load_json(\"4-refpairtest.json\")\n",
"data = ip.load_json(\"tortas/5-tortas.json\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" state | \n",
" reads_raw | \n",
" reads_passed_filter | \n",
"
\n",
" \n",
" \n",
" \n",
" AGO02concat | \n",
" 2 | \n",
" 11050294 | \n",
" 10800672 | \n",
"
\n",
" \n",
" AGO08concat | \n",
" 2 | \n",
" 13408401 | \n",
" 13030329 | \n",
"
\n",
" \n",
" AGO09concat | \n",
" 2 | \n",
" 15650127 | \n",
" 15121047 | \n",
"
\n",
" \n",
" AGO11concat | \n",
" 2 | \n",
" 12848936 | \n",
" 12370018 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" state reads_raw reads_passed_filter\n",
"AGO02concat 2 11050294 10800672\n",
"AGO08concat 2 13408401 13030329\n",
"AGO09concat 2 15650127 15121047\n",
"AGO11concat 2 12848936 12370018"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from ipyrad.assemble.cluster_across import *\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from ipyrad.assemble.clustmap import *\n",
"s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)\n",
"samples = list(s3.data.samples.values())\n",
"sample = samples[1]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:01 | concatenating | s3 |\n",
"[####################] 100% 0:00:01 | join unmerged pairs | s3 |\n",
"[####################] 100% 0:00:00 | dereplicating | s3 |\n",
"[####################] 100% 0:00:00 | splitting dereps | s3 |\n",
"[####################] 100% 0:00:02 | mapping reads | s3 |\n",
"[####################] 100% 0:00:10 | building clusters | s3 |\n"
]
}
],
"source": [
"s3.data.ipcluster[\"threads\"] = 4\n",
"s3.run()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Assembly: 4-refpairtest\n",
"[####################] 100% 0:00:04 | inferring [H, E] | s4 |\n",
"[####################] 100% 0:00:00 | calculating depths | s5 |\n",
"[####################] 100% 0:00:00 | chunking clusters | s5 |\n",
"[####################] 100% 0:00:25 | consens calling | s5 |\n"
]
}
],
"source": [
"s3.data.run(\"45\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"#s3.data.stats\n",
"data = s3.data\n",
"jobid = 0\n",
"samples = list(data.samples.values())[:4]\n",
"randomseed = 123"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/home/deren/Documents/ipyrad/tests/4-refpairtest_consens/1B_0.consens.gz',\n",
" '/home/deren/Documents/ipyrad/tests/4-refpairtest_consens/1D_0.consens.gz',\n",
" '/home/deren/Documents/ipyrad/tests/4-refpairtest_consens/2H_0.consens.gz',\n",
" '/home/deren/Documents/ipyrad/tests/4-refpairtest_consens/3J_0.consens.gz']"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"conshandles = [\n",
" sample.files.consens[0] for sample in samples if \n",
" sample.stats.reads_consens]\n",
"conshandles.sort()\n",
"assert conshandles, \"no consensus files found\"\n",
"conshandles"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"## concatenate all of the gzipped consens files\n",
"cmd = ['cat'] + conshandles\n",
"groupcons = os.path.join(\n",
" data.dirs.across, \n",
" \"{}-{}-catcons.gz\".format(data.name, jobid))\n",
"LOGGER.debug(\" \".join(cmd))\n",
"with open(groupcons, 'w') as output:\n",
" call = sps.Popen(cmd, stdout=output, close_fds=True)\n",
" call.communicate()\n",
"\n",
"## a string of sed substitutions for temporarily replacing hetero sites\n",
"## skips lines with '>', so it doesn't affect taxon names\n",
"subs = [\"/>/!s/W/A/g\", \"/>/!s/w/A/g\", \"/>/!s/R/A/g\", \"/>/!s/r/A/g\",\n",
" \"/>/!s/M/A/g\", \"/>/!s/m/A/g\", \"/>/!s/K/T/g\", \"/>/!s/k/T/g\",\n",
" \"/>/!s/S/C/g\", \"/>/!s/s/C/g\", \"/>/!s/Y/C/g\", \"/>/!s/y/C/g\"]\n",
"subs = \";\".join(subs)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"## pipe passed data from gunzip to sed.\n",
"cmd1 = [\"gunzip\", \"-c\", groupcons]\n",
"cmd2 = [\"sed\", subs]\n",
"LOGGER.debug(\" \".join(cmd1))\n",
"LOGGER.debug(\" \".join(cmd2))\n",
"\n",
"proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)\n",
"allhaps = groupcons.replace(\"-catcons.gz\", \"-cathaps.gz\")\n",
"with open(allhaps, 'w') as output:\n",
" proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)\n",
" proc2.communicate()\n",
"proc1.stdout.close()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"import ipyrad\n",
"\n",
"# directory (relative to the working directory) that receives the\n",
"# across-sample files; created if missing. exist_ok avoids the\n",
"# check-then-create race and still raises if the path is a regular file.\n",
"data.dirs.across = data.name + \"_across\"\n",
"os.makedirs(data.dirs.across, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(None, None)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def _run_checked(cmd, **popen_kwargs):\n",
"    \"\"\"Run cmd to completion and raise CalledProcessError on a nonzero exit.\n",
"\n",
"    Extra keyword arguments are passed to Popen. Returns the value of\n",
"    Popen.communicate().\n",
"    \"\"\"\n",
"    proc = sps.Popen(cmd, close_fds=True, **popen_kwargs)\n",
"    result = proc.communicate()\n",
"    if proc.returncode:\n",
"        raise sps.CalledProcessError(proc.returncode, cmd)\n",
"    return result\n",
"\n",
"\n",
"## first consens file of every sample that has consensus reads\n",
"conshandles = [\n",
"    sample.files.consens[0] for sample in samples\n",
"    if sample.stats.reads_consens]\n",
"conshandles.sort()\n",
"assert conshandles, \"no consensus files found\"\n",
"\n",
"## concatenate all of the gzipped consens files\n",
"cmd = ['cat'] + conshandles\n",
"groupcons = os.path.join(\n",
"    data.dirs.across,\n",
"    \"{}-{}-catcons.gz\".format(data.name, jobid))\n",
"LOGGER.debug(\" \".join(cmd))\n",
"## binary mode: the child process writes gzip bytes to this handle\n",
"with open(groupcons, 'wb') as output:\n",
"    _run_checked(cmd, stdout=output)\n",
"\n",
"## a string of sed substitutions for temporarily replacing hetero sites\n",
"## skips lines with '>', so it doesn't affect taxon names\n",
"## rules (upper and lower case): W, R, M -> A; K -> T; S, Y -> C\n",
"subs = [\"/>/!s/W/A/g\", \"/>/!s/w/A/g\", \"/>/!s/R/A/g\", \"/>/!s/r/A/g\",\n",
"        \"/>/!s/M/A/g\", \"/>/!s/m/A/g\", \"/>/!s/K/T/g\", \"/>/!s/k/T/g\",\n",
"        \"/>/!s/S/C/g\", \"/>/!s/s/C/g\", \"/>/!s/Y/C/g\", \"/>/!s/y/C/g\"]\n",
"subs = \";\".join(subs)\n",
"\n",
"## impute pseudo-haplo information to avoid mismatch at hetero sites\n",
"## the read data with hetero sites is put back into clustered data later.\n",
"## pipe passed data from gunzip to sed.\n",
"cmd1 = [\"gunzip\", \"-c\", groupcons]\n",
"cmd2 = [\"sed\", subs]\n",
"LOGGER.debug(\" \".join(cmd1))\n",
"LOGGER.debug(\" \".join(cmd2))\n",
"\n",
"proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)\n",
"## the name keeps a .gz suffix but sed's output is written uncompressed\n",
"allhaps = groupcons.replace(\"-catcons.gz\", \"-cathaps.gz\")\n",
"with open(allhaps, 'wb') as output:\n",
"    _run_checked(cmd2, stdin=proc1.stdout, stdout=output)\n",
"proc1.stdout.close()\n",
"## reap gunzip and make sure it succeeded as well\n",
"if proc1.wait():\n",
"    raise sps.CalledProcessError(proc1.returncode, cmd1)\n",
"\n",
"## now sort the file using vsearch\n",
"allsort = groupcons.replace(\"-catcons.gz\", \"-catsort.fa\")\n",
"cmd1 = [ipyrad.bins.vsearch,\n",
"        \"--sortbylength\", allhaps,\n",
"        \"--fasta_width\", \"0\",\n",
"        \"--output\", allsort]\n",
"LOGGER.debug(\" \".join(cmd1))\n",
"_run_checked(cmd1)\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"from ipyrad.assemble.cluster_across import *\n",
"from itertools import groupby\n",
"\n",
"random.seed(randomseed)\n",
"\n",
"## Shuffle the records inside every run of equal sequence-line length, so\n",
"## the file keeps its length-sorted order while ties end up in seeded\n",
"## random order. One random.shuffle call is made per run, in file order.\n",
"allshuf = groupcons.replace(\"-catcons.gz\", \"-catshuf.fa\")\n",
"with open(allsort, 'r') as indat, open(allshuf, 'wt') as outdat:\n",
"    ## the same file iterator twice yields (name line, sequence line) pairs\n",
"    idat = izip(indat, indat)\n",
"    ## an empty input simply produces no runs and an empty output file\n",
"    for _, run in groupby(idat, key=lambda rec: len(rec[-1])):\n",
"        chunk = list(run)\n",
"        random.shuffle(chunk)\n",
"        outdat.write(\"\".join(chain(*chunk)))"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def _read_to_array(read, reg):\n",
"    \"\"\"Lay one aligned read onto a '-'-filled array spanning region `reg`.\n",
"\n",
"    `reg` is a (chrom, start, end) tuple. The array has end - start cells\n",
"    and the sequence returned by cigared(read.seq, read.cigar) is written\n",
"    starting at index read.reference_start - start.\n",
"    \"\"\"\n",
"    arr = np.full(reg[2] - reg[1], \"-\", dtype=\"U1\")\n",
"    seq = cigared(read.seq, read.cigar)\n",
"    offset = read.reference_start - reg[1]\n",
"    arr[offset:offset + len(seq)] = list(seq)\n",
"    return arr\n",
"\n",
"\n",
"def _cluster_entry(read, reg, seq):\n",
"    \"\"\"Return the two-line record (name line, sequence line) for one read.\n",
"\n",
"    The name is 'chrom:start-end;size=N;ori' where N is the text after the\n",
"    last '=' in read.qname and ori is '-' if read.is_reverse else '+'.\n",
"    \"\"\"\n",
"    ori = \"-\" if read.is_reverse else \"+\"\n",
"    derep = read.qname.split(\"=\")[-1]\n",
"    rname = \"{}:{}-{};size={};{}\".format(*reg, derep, ori)\n",
"    return \"{}\\n{}\".format(rname, seq)\n",
"\n",
"\n",
"def build_clusters_from_cigars(data, sample, chunksize=10):\n",
"    \"\"\"Write <sample.name>.clustS.gz by stacking mapped reads per region.\n",
"\n",
"    Regions are parsed from bedtools_merge(data, sample) as tab-separated\n",
"    (chrom, start, end) lines. The reads of each region are fetched from\n",
"    <sample.name>-mapped-sorted.bam, ordered by the integer after the last\n",
"    '=' in their name (largest first) and written as name/sequence line\n",
"    pairs. Every cluster is followed by two '//' lines.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : assembly object\n",
"        Supplies dirs.refmapping, dirs.clusts and paramsdict[\"datatype\"].\n",
"    sample : sample object\n",
"        Only sample.name is read here.\n",
"    chunksize : int\n",
"        Number of clusters buffered in memory between writes.\n",
"\n",
"    Notes\n",
"    -----\n",
"    For datatypes containing 'pair' a read name is kept only when two\n",
"    reads with that name were fetched in the region. Regions that yield\n",
"    no records are skipped, so no empty cluster is ever written.\n",
"    \"\"\"\n",
"    # one chrom/start/end line per region. Blank lines are dropped so an\n",
"    # empty bedtools result gives zero regions instead of an unpack error\n",
"    fullregions = bedtools_merge(data, sample).strip().split(\"\\n\")\n",
"    regions = (i.split(\"\\t\") for i in fullregions if i.strip())\n",
"    regions = ((i, int(j), int(k)) for (i, j, k) in regions)\n",
"\n",
"    bampath = os.path.join(\n",
"        data.dirs.refmapping, \"{}-mapped-sorted.bam\".format(sample.name))\n",
"    opath = os.path.join(\n",
"        data.dirs.clusts, \"{}.clustS.gz\".format(sample.name))\n",
"    ispair = \"pair\" in data.paramsdict[\"datatype\"]\n",
"    sep = \"\\n//\\n//\\n\"\n",
"\n",
"    # access reads from bam file using pysam; closed in the finally below\n",
"    bamfile = AlignmentFile(bampath, 'rb')\n",
"    try:\n",
"        with gzip.open(opath, 'wt') as out:\n",
"            clusters = []\n",
"            for reg in regions:\n",
"                # uncomment and compare against ref sequence when testing\n",
"                #ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n",
"\n",
"                # index reads by name; pairs get a [first, second] slot\n",
"                rdict = {}\n",
"                for read in bamfile.fetch(*reg):\n",
"                    if not ispair:\n",
"                        rdict[read.qname] = read\n",
"                    elif read.qname not in rdict:\n",
"                        rdict[read.qname] = [read, None]\n",
"                    else:\n",
"                        rdict[read.qname][1] = read\n",
"\n",
"                # sort keys by derep number\n",
"                keys = sorted(\n",
"                    rdict, key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n",
"\n",
"                # build the cluster based on map positions, orientation, cigar\n",
"                clust = []\n",
"                for key in keys:\n",
"                    if ispair:\n",
"                        r1, r2 = rdict[key]\n",
"                        if not (r1 and r2):\n",
"                            continue\n",
"                        arr = join_arrays(\n",
"                            _read_to_array(r1, reg), _read_to_array(r2, reg))\n",
"                    else:\n",
"                        r1 = rdict[key]\n",
"                        arr = _read_to_array(r1, reg)\n",
"                    clust.append(_cluster_entry(r1, reg, \"\".join(arr)))\n",
"\n",
"                # store this cluster, unless the region produced no records\n",
"                # (an empty entry would put a blank line in the output)\n",
"                if clust:\n",
"                    clusters.append(\"\\n\".join(clust))\n",
"\n",
"                # flush the buffer once it holds `chunksize` clusters\n",
"                if clusters and len(clusters) >= chunksize:\n",
"                    out.write(sep.join(clusters) + sep)\n",
"                    clusters = []\n",
"\n",
"            # write final remaining clusters; an empty buffer writes nothing\n",
"            # so the file never ends with a stray separator\n",
"            if clusters:\n",
"                out.write(sep.join(clusters) + sep)\n",
"    finally:\n",
"        bamfile.close()\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"build_clusters_from_cigars(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#maxlens, depths = get_quick_depths(data, sample)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#sample_cleanup(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"def get_quick_depths(data, sample):\n",
"    \"\"\"Scan a sample's clustS file; return per-cluster lengths and depths.\n",
"\n",
"    The gzipped file is read two lines at a time as (name, sequence). A\n",
"    pair whose two lines are equal after stripping (the '//' separator\n",
"    pair) closes the current cluster. For any other pair the depth is\n",
"    parsed from the text after the last '=' in the name, without its\n",
"    final two characters, and added to the running total.\n",
"\n",
"    If sample.files.clusters is empty it is set to\n",
"    <data.dirs.clusts>/<sample.name>.clustS.gz first.\n",
"\n",
"    Returns\n",
"    -------\n",
"    (maxlen, depths) : two numpy arrays, one entry per closed cluster.\n",
"        maxlen holds len() of the last sequence line read in the cluster\n",
"        (the line is not stripped); depths holds the summed depth.\n",
"    \"\"\"\n",
"    ## use existing sample cluster path if it exists, since this\n",
"    ## func can be used in step 4 and that can occur after merging\n",
"    ## assemblies after step3, and if we then referenced by data.dirs.clusts\n",
"    ## the path would be broken.\n",
"    if not sample.files.clusters:\n",
"        ## set cluster file handles\n",
"        sample.files.clusters = os.path.join(\n",
"            data.dirs.clusts, sample.name + \".clustS.gz\")\n",
"\n",
"    ## storage\n",
"    depths = []\n",
"    maxlen = []\n",
"\n",
"    ## running totals for the cluster currently being read\n",
"    tdepth = 0\n",
"    tlen = 0\n",
"\n",
"    ## get new clustered loci\n",
"    fclust = data.samples[sample.name].files.clusters\n",
"    with gzip.open(fclust, 'rt') as clusters:\n",
"        ## the same iterator twice yields consecutive (name, seq) line pairs\n",
"        for name, seq in izip(*[iter(clusters)] * 2):\n",
"            if name.strip() == seq.strip():\n",
"                ## separator pair: the current cluster is complete\n",
"                depths.append(tdepth)\n",
"                maxlen.append(tlen)\n",
"                tlen = 0\n",
"                tdepth = 0\n",
"            else:\n",
"                try:\n",
"                    tdepth += int(name.strip().split(\"=\")[-1][:-2])\n",
"                    tlen = len(seq)\n",
"                except ValueError:\n",
"                    ## the name has no parsable size; show it and carry on\n",
"                    print(name)\n",
"\n",
"    ## return\n",
"    return np.array(maxlen), np.array(depths)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"def sample_cleanup(data, sample):\n",
"    \"\"\"Record step-3 cluster statistics on `sample`, then remove temp files.\n",
"\n",
"    Cluster lengths and depths come from get_quick_depths(). When there\n",
"    are no clusters, or every depth is zero, a message is printed and\n",
"    nothing else is changed.\n",
"\n",
"    Otherwise this updates sample.stats, sample.stats_dfs.s3 and\n",
"    sample.depths, and may raise data._hackersonly[\"max_fragment_length\"].\n",
"    For any assembly_method other than \"denovo\" it also stores read counts\n",
"    parsed from `samtools flagstat` and deletes the unmapped bam and the\n",
"    sam file. Intermediate clustering files are deleted unless the ipyrad\n",
"    logger's effective level is 10.\n",
"    \"\"\"\n",
"    ## get maxlen and depths array from clusters\n",
"    maxlens, depths = get_quick_depths(data, sample)\n",
"\n",
"    ## stop when there are no clusters at all or only zero depths.\n",
"    ## (size is tested first because .max() raises on an empty array)\n",
"    if not depths.size or not depths.max():\n",
"        print(\" no clusters found for {}\".format(sample.name))\n",
"        return\n",
"\n",
"    mindepth_mj = data.paramsdict[\"mindepth_majrule\"]\n",
"    mindepth_stat = data.paramsdict[\"mindepth_statistical\"]\n",
"\n",
"    ## store which min was used to calculate hidepth here\n",
"    sample.stats_dfs.s3[\"hidepth_min\"] = mindepth_mj\n",
"\n",
"    ## lengths of the clusters that reach the majrule depth\n",
"    hidepths = depths >= mindepth_mj\n",
"    maxlens = maxlens[hidepths]\n",
"\n",
"    ## If mean + 2 sd of those lengths exceeds the current\n",
"    ## max_fragment_length, raise the limit to that value plus 4 (the\n",
"    ## extra 4 allows for pair separators). With no hidepth clusters\n",
"    ## maxlen is 0 and the limit is left alone.\n",
"    if maxlens.any():\n",
"        maxlen = int(maxlens.mean() + (2. * maxlens.std()))\n",
"    else:\n",
"        maxlen = 0\n",
"    if maxlen > data._hackersonly[\"max_fragment_length\"]:\n",
"        data._hackersonly[\"max_fragment_length\"] = maxlen + 4\n",
"\n",
"    ## make sense of stats\n",
"    keepmj = depths[hidepths]\n",
"    keepstat = depths[depths >= mindepth_stat]\n",
"\n",
"    ## sample summary stat assignments\n",
"    sample.stats[\"state\"] = 3\n",
"    sample.stats[\"clusters_total\"] = depths.shape[0]\n",
"    sample.stats[\"clusters_hidepth\"] = keepmj.shape[0]\n",
"\n",
"    ## store depths histogram as a dict, using bin edges 1 through 25;\n",
"    ## empty bins are left out\n",
"    bars, bins = np.histogram(depths, bins=range(1, 26))\n",
"    sample.depths = {int(i): int(v) for i, v in zip(bins, bars) if v}\n",
"\n",
"    ## sample stat assignments\n",
"    ## Trap numpy warnings (\"mean of empty slice\") printed by samples\n",
"    ## with few reads.\n",
"    with warnings.catch_warnings():\n",
"        warnings.simplefilter(\"ignore\", category=RuntimeWarning)\n",
"        sample.stats_dfs.s3[\"merged_pairs\"] = sample.stats.reads_merged\n",
"        sample.stats_dfs.s3[\"clusters_total\"] = depths.shape[0]\n",
"        try:\n",
"            sample.stats_dfs.s3[\"clusters_hidepth\"] = (\n",
"                int(sample.stats[\"clusters_hidepth\"]))\n",
"        except ValueError:\n",
"            ## Handle clusters_hidepth == NaN\n",
"            sample.stats_dfs.s3[\"clusters_hidepth\"] = 0\n",
"        sample.stats_dfs.s3[\"avg_depth_total\"] = depths.mean()\n",
"        sample.stats_dfs.s3[\"avg_depth_mj\"] = keepmj.mean()\n",
"        sample.stats_dfs.s3[\"avg_depth_stat\"] = keepstat.mean()\n",
"        sample.stats_dfs.s3[\"sd_depth_total\"] = depths.std()\n",
"        sample.stats_dfs.s3[\"sd_depth_mj\"] = keepmj.std()\n",
"        sample.stats_dfs.s3[\"sd_depth_stat\"] = keepstat.std()\n",
"\n",
"    ## Get some stats from the bam files\n",
"    ## This is moderately hackish. samtools flagstat returns\n",
"    ## the number of reads in the bam file as the first element\n",
"    ## of the first line, this call makes this assumption.\n",
"    if not data.paramsdict[\"assembly_method\"] == \"denovo\":\n",
"        ## shorter names\n",
"        mapf = os.path.join(\n",
"            data.dirs.refmapping, sample.name + \"-mapped-sorted.bam\")\n",
"        umapf = os.path.join(\n",
"            data.dirs.refmapping, sample.name + \"-unmapped.bam\")\n",
"        samplesam = os.path.join(\n",
"            data.dirs.refmapping, sample.name + \".sam\")\n",
"\n",
"        ## first whitespace-separated token of each flagstat report\n",
"        counts = {}\n",
"        for key, bam in [(\"refseq_unmapped_reads\", umapf),\n",
"                         (\"refseq_mapped_reads\", mapf)]:\n",
"            cmd = [ip.bins.samtools, \"flagstat\", bam]\n",
"            proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"            counts[key] = int(proc.communicate()[0].split()[0])\n",
"\n",
"        ## store results\n",
"        ## If PE, samtools reports the _actual_ number of reads mapped, both\n",
"        ## R1 and R2, so here if PE divide the results by 2 to stay consistent\n",
"        ## with how we've been reporting R1 and R2 as one \"read pair\".\n",
"        ## Floor division keeps the stored count an int on Python 3.\n",
"        ispair = \"pair\" in data.paramsdict[\"datatype\"]\n",
"        for key, nreads in counts.items():\n",
"            sample.stats[key] = nreads // 2 if ispair else nreads\n",
"\n",
"        ## the unmapped bam and the sam file are no longer needed\n",
"        for rfile in [umapf, samplesam]:\n",
"            if os.path.exists(rfile):\n",
"                os.remove(rfile)\n",
"\n",
"    # if loglevel==DEBUG (numeric level 10) keep the intermediate files\n",
"    log_level = ip.logger.getEffectiveLevel()\n",
"    if not log_level == 10:\n",
"        ## Clean up loose files only if not in DEBUG\n",
"        ##- edits/*derep, utemp, *utemp.sort, *htemp, *clust.gz\n",
"        derepfile = os.path.join(data.dirs.edits, sample.name + \"_derep.fastq\")\n",
"        mergefile = os.path.join(data.dirs.edits, sample.name + \"_merged_.fastq\")\n",
"        uhandle = os.path.join(data.dirs.clusts, sample.name + \".utemp\")\n",
"        usort = os.path.join(data.dirs.clusts, sample.name + \".utemp.sort\")\n",
"        hhandle = os.path.join(data.dirs.clusts, sample.name + \".htemp\")\n",
"        clusters = os.path.join(data.dirs.clusts, sample.name + \".clust.gz\")\n",
"\n",
"        for rfile in [derepfile, mergefile, uhandle, usort, hhandle, clusters]:\n",
"            if os.path.exists(rfile):\n",
"                os.remove(rfile)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# optimize speed of this next\n",
"build_clusters_from_cigars(data, sample)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Consensus for refmapped data -- store all (even long pairs)?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Write each out to a sam file...\n",
"# newconsensus()\n",
"# \n",
"data._este = data.stats.error_est.mean()\n",
"data._esth = data.stats.hetero_est.mean()\n",
"\n",
"clusters = open(os.path.join(data.dirs.clusts, \"{}.clustS.gz\".format(sample.name)), 'r')\n",
"clusters.read()\n",
"\n",
"# plan to fill an h5 for this sample\n",
"tmp5 = consenshandle.replace(\"_tmpcons.\", \"_tmpcats.\")\n",
"with h5py.File(tmp5, 'w') as io5:\n",
" io5.create_dataset(\"cats\", (optim, maxlen, 4), dtype=np.uint32)\n",
" io5.create_dataset(\"alls\", (optim, ), dtype=np.uint8)\n",
" io5.create_dataset(\"chroms\", (optim, 3), dtype=np.int64)\n",
"\n",
" ## local copies to use to fill the arrays\n",
" catarr = io5[\"cats\"][:]\n",
" nallel = io5[\"alls\"][:]\n",
" refarr = io5[\"chroms\"][:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Step 6 for refmapped pairs: \n",
"# 1. convert all sams to bams and make a merged mapped-sorted.bam\n",
"# 2. get overlapping regions with bedtools_merge()\n",
"# 3. pull consens reads in aligned regions with bamfile.fetch()\n",
"# 4. store the consensus sequence in h5.\n",
"# 5. store the variants in h5.\n",
"# 6. store the depth of variants in h5.\n",
"# 7. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# get all regions with reads. Generator to yield (str, int, int)\n",
"fullregions = bedtools_merge(data, sample).strip().split(\"\\n\") \n",
"regions = (i.split(\"\\t\") for i in fullregions)\n",
"regions = ((i, int(j), int(k)) for (i, j, k) in regions)\n",
"\n",
"# access reads from bam file using pysam\n",
"samfile = AlignmentFile(\n",
" os.path.join(data.dirs.refmapping, \n",
" \"{}-mapped-sorted.bam\".format(sample.name)),\n",
" 'rb')\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"reg = next(regions)\n",
"ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n",
"reads = samfile.fetch(*reg)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# index the fetched reads by name (a later read with the same name\n",
"# replaces the earlier one)\n",
"rdict = {}\n",
"for read in reads:\n",
"    rdict[read.qname] = read\n",
"\n",
"# order the read names by the integer after the last '=', largest first\n",
"keys = sorted(rdict, key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n",
"\n",
"# one name-line + sequence-line entry per read, with the sequence padded\n",
"# by '-' to the length of the reference region\n",
"clust = []\n",
"for key in keys:\n",
"    r1 = rdict[key]\n",
"\n",
"    aref = np.array(list(ref[1]))\n",
"    arr1 = np.full(aref.size, \"-\", dtype=\"U1\")\n",
"\n",
"    # write the cigar-adjusted read at its offset from the region start\n",
"    seq = cigared(r1.seq, r1.cigar)\n",
"    start = r1.reference_start - reg[1]\n",
"    arr1[start:start + len(seq)] = list(seq)\n",
"    aseq = \"\".join(arr1)\n",
"\n",
"    ori = \"-\" if r1.is_reverse else \"+\"\n",
"    derep = r1.qname.split(\"=\")[-1]\n",
"    rname = \"{}:{}-{};size={};{}\".format(*reg, derep, ori)\n",
"    clust.append(\"{}\\n{}\".format(rname, aseq))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['MT:96809-96900;size=18;+\\nTGCAGCTGCGGTAGTTAACGAACAGCCTGTCTTGTCTAAAGGGTTAAAAATCAGGTCCGGTGTACAGGCGACGATAGAGAACCCCGGCCTA',\n",
" 'MT:96809-96900;size=1;+\\nTGCAGCTGCGGTAGTTAACGAACAGCCTGTCTTGTCTAAAGGGTTAAAAATCAGGTCCGGTGTACAAGCGACGATAGAGAACCCCGGCCTA',\n",
" 'MT:96809-96900;size=1;+\\nTGCAGCTGCGGTAGTTAACGAACAGCCTGTCTTGTCTAAAGGGTTAAAAATCAGGTCCGGTGTACAGGCGACGACAGAGAACCCCGGCCTA']"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clust"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'37013bcb5c4c2a2541dacf4f2e807af0;size=16': [,\n",
" None]}"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rdict"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"ename": "IPyradError",
"evalue": "None",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmapping_reads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mmapping_reads\u001b[0;34m(data, sample, nthreads)\u001b[0m\n\u001b[1;32m 1498\u001b[0m \u001b[0;31m# -O = Output file format, in this case bam\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1499\u001b[0m \u001b[0;31m# -o = Output file name\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1500\u001b[0;31m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1501\u001b[0m \u001b[0;31m# (cmd5) samtools bam2fq -v 45 [in.bam]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1502\u001b[0m \u001b[0;31m# -v45 set the default qscore arbirtrarily high\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIPyradError\u001b[0m: None"
]
}
],
"source": [
"mapping_reads(data, sample, 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Assembly: 4-refpairtest\n",
"[####################] 100% 0:00:00 | calculating depths | s5 |\n",
"[####################] 100% 0:00:00 | chunking clusters | s5 |\n",
"[####################] 100% 0:00:24 | consens calling | s5 |\n"
]
}
],
"source": [
"s3.data.run(\"5\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"subsamples = list(s3.data.samples.values())\n",
"subsamples.sort(key=lambda x: x.stats.clusters_hidepth, reverse=True)\n",
"jobs = {}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "index 499 is out of bounds for axis 0 with size 499",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mipyrad\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massemble\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjointestimate\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0moptim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/jointestimate.py\u001b[0m in \u001b[0;36moptim\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;31m## get array of all clusters data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 280\u001b[0;31m \u001b[0mstacked\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstackarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 281\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;31m## get base frequencies\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/jointestimate.py\u001b[0m in \u001b[0;36mstackarray\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 254\u001b[0m dtype=np.uint64).T\n\u001b[1;32m 255\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 256\u001b[0;31m \u001b[0mstacked\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnclust\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mcatg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcatg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 257\u001b[0m \u001b[0mnclust\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIndexError\u001b[0m: index 499 is out of bounds for axis 0 with size 499"
]
}
],
"source": [
"from ipyrad.assemble.jointestimate import *\n",
"optim(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(, 499, 495, 499, 495)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample, _, _, nhidepth, maxlen = recal_hidepth(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"data\n",
"concat_multiple_edits(data, sample)\n",
"merge_pairs_with_vsearch(data, sample, True)\n",
"merge_end_to_end(data, sample, True, True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"dereplicate(data, sample, 2)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"cluster(data, sample, 2, 1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"build_clusters(data, sample, 5)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"muscle_chunker(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"for idx in range(10):\n",
" handle = os.path.join(s3.data.tmpdir, \n",
" \"{}_chunk_{}.ali\".format(sample.name, idx))\n",
" align_and_parse(handle, 5, 0)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"reconcat(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:01 | concatenating | s3 |\n",
"[####################] 100% 0:00:01 | join unmerged pairs | s3 |\n",
"[####################] 100% 0:00:00 | dereplicating | s3 |\n",
"[####################] 100% 0:00:00 | splitting dereps | s3 |\n",
"[####################] 100% 0:00:02 | mapping reads | s3 |\n",
"[####################] 100% 0:00:37 | building clusters | s3 |\n"
]
}
],
"source": [
"s3.run()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([460, 475, 470, 482, 468, 463, 488, 445, 474, 490, 446, 469, 482,\n",
" 457, 457, 470, 477, 445, 442, 468, 486, 447, 473, 460, 484, 467,\n",
" 482, 442, 446, 447, 458, 471, 455, 476, 480, 443, 461, 457, 479,\n",
" 452, 444, 464, 473, 477, 448, 446, 453, 488, 490, 482, 471, 483,\n",
" 480, 450, 456, 482, 454, 462, 457, 456, 455, 463, 447, 488, 457,\n",
" 446, 472, 450, 472, 482, 472, 451, 491, 479, 469, 446, 489, 486,\n",
" 463, 463, 489, 476, 458, 484, 463, 458, 443, 479, 488, 469, 470,\n",
" 474, 454, 447, 447, 489, 451, 449, 453, 491, 484, 471, 483, 481,\n",
" 483, 472, 468, 457, 464, 446, 490, 462, 452, 461, 457, 474, 469,\n",
" 481, 458, 461, 446, 472, 455, 455, 488, 471, 482, 489, 466, 461,\n",
" 473, 477, 466, 488, 461, 487, 453, 466, 450, 472, 458, 448, 490,\n",
" 484, 476, 445, 488, 451, 484, 486, 463, 469, 452, 462, 453, 476,\n",
" 448, 445, 468, 453, 463, 469, 487, 445, 453, 469, 447, 444, 464,\n",
" 488, 468, 448, 461, 465, 483, 446, 447, 473, 448, 448, 444, 481,\n",
" 464, 474, 464, 459, 461, 458, 451, 449, 468, 445, 491, 443, 448,\n",
" 475, 450, 454, 454, 467, 478, 452, 451, 468, 480, 468, 464, 484,\n",
" 462, 484, 475, 457, 463, 453, 473, 457, 486, 469, 485, 462, 464,\n",
" 442, 475, 473, 470, 482, 444, 456, 479, 472, 477, 446, 443, 471,\n",
" 473, 489, 467, 463, 476, 481, 448, 450, 460, 452, 458, 479, 444,\n",
" 457, 489, 488, 444, 453, 472, 461, 481, 451, 450, 491, 473, 480,\n",
" 491, 489, 468, 458, 472, 460, 450, 462, 447, 457, 465, 452, 468,\n",
" 475, 465, 491, 447, 453, 448, 471, 490, 472, 446, 450, 478, 468,\n",
" 455, 470, 454, 464, 479, 472, 456, 475, 479, 446, 474, 470, 474,\n",
" 446, 491, 479, 470, 457, 457, 482, 453, 468, 457, 454, 453, 445,\n",
" 489, 477, 451, 478, 485, 456, 485, 449, 485, 478, 468, 462, 454,\n",
" 486, 482, 471, 488, 444, 485, 450, 485, 449, 444, 453, 482, 484,\n",
" 472, 456, 474, 461, 487, 468, 482, 480, 459, 443, 489, 447, 444,\n",
" 453, 447, 470, 486, 442, 442, 487, 445, 471, 480, 475, 450, 456,\n",
" 473, 468, 455, 452, 491, 470, 479, 469, 469, 442, 489, 451, 476,\n",
" 464, 477, 487, 484, 468, 454, 462, 460, 476, 442, 476, 489, 481,\n",
" 477, 485, 456, 453, 476, 471, 473, 453, 480, 490, 479, 488, 470,\n",
" 483, 483, 450, 451, 459, 469, 456, 461, 458, 451, 450, 446, 482,\n",
" 446, 448, 472, 489, 485, 489, 449, 477, 488, 452, 460, 450, 471,\n",
" 444, 452, 456, 452, 467, 455, 489, 481, 474, 463, 456, 475, 453,\n",
" 445, 487, 481, 445, 476, 476, 483, 456, 455, 444, 443, 452, 478,\n",
" 488, 452, 471, 452, 483, 484, 490, 457, 487, 464, 455, 480, 491,\n",
" 475, 449, 489, 447, 468, 468, 485, 475, 484, 455, 444, 450, 466,\n",
" 448, 442, 459, 488, 482, 455, 474, 480, 482, 474, 484, 488, 469,\n",
" 475, 481, 482, 446, 473]),\n",
" array([17, 14, 17, 20, 25, 21, 18, 25, 21, 24, 22, 23, 21, 22, 23, 20, 17,\n",
" 22, 19, 20, 25, 26, 17, 21, 19, 25, 19, 21, 20, 16, 21, 18, 22, 22,\n",
" 20, 21, 22, 23, 20, 19, 19, 17, 16, 27, 21, 20, 17, 19, 20, 18, 15,\n",
" 20, 24, 20, 19, 19, 20, 23, 20, 23, 20, 19, 20, 23, 22, 25, 17, 27,\n",
" 22, 26, 22, 19, 21, 18, 21, 20, 19, 20, 25, 19, 19, 16, 23, 19, 17,\n",
" 18, 18, 21, 16, 19, 26, 22, 22, 23, 18, 22, 21, 24, 18, 23, 19, 18,\n",
" 20, 18, 21, 22, 21, 22, 19, 19, 18, 20, 17, 19, 22, 25, 20, 21, 19,\n",
" 21, 19, 20, 24, 19, 22, 22, 20, 19, 21, 20, 17, 21, 19, 19, 22, 19,\n",
" 19, 16, 19, 22, 18, 19, 15, 18, 18, 20, 20, 25, 17, 21, 19, 18, 18,\n",
" 22, 21, 21, 19, 21, 14, 25, 19, 17, 20, 21, 14, 20, 21, 20, 18, 25,\n",
" 27, 16, 19, 18, 21, 22, 15, 18, 19, 21, 21, 19, 17, 20, 19, 20, 22,\n",
" 24, 18, 18, 18, 25, 19, 19, 19, 17, 26, 18, 20, 22, 20, 22, 22, 21,\n",
" 21, 21, 21, 18, 18, 19, 19, 19, 25, 17, 22, 21, 17, 18, 18, 18, 20,\n",
" 22, 17, 19, 19, 20, 25, 19, 25, 18, 17, 25, 15, 19, 19, 23, 19, 22,\n",
" 23, 17, 15, 22, 16, 20, 24, 23, 19, 18, 24, 19, 20, 20, 19, 19, 18,\n",
" 26, 18, 18, 23, 20, 18, 21, 22, 25, 23, 16, 19, 25, 24, 22, 21, 20,\n",
" 20, 18, 25, 17, 18, 14, 17, 17, 18, 19, 17, 19, 12, 15, 24, 16, 19,\n",
" 15, 18, 20, 21, 17, 19, 21, 26, 19, 22, 19, 19, 21, 29, 19, 22, 21,\n",
" 22, 19, 23, 17, 25, 20, 21, 20, 23, 18, 21, 19, 27, 14, 18, 16, 19,\n",
" 21, 17, 25, 17, 19, 21, 20, 23, 22, 18, 24, 20, 20, 21, 25, 15, 18,\n",
" 17, 21, 27, 20, 24, 23, 21, 20, 26, 20, 23, 20, 21, 23, 21, 19, 18,\n",
" 21, 19, 21, 15, 22, 22, 23, 21, 19, 20, 14, 21, 25, 18, 20, 13, 17,\n",
" 26, 23, 19, 18, 14, 22, 25, 21, 17, 23, 23, 17, 20, 19, 22, 20, 21,\n",
" 18, 17, 17, 19, 26, 19, 18, 21, 20, 19, 22, 15, 20, 14, 19, 22, 23,\n",
" 20, 17, 23, 16, 17, 18, 18, 20, 16, 16, 14, 22, 19, 19, 22, 16, 22,\n",
" 24, 18, 21, 22, 18, 22, 20, 25, 17, 16, 25, 27, 20, 22, 22, 23, 21,\n",
" 21, 23, 15, 17, 22, 21, 23, 17, 16, 26, 20, 17, 22, 24, 21, 21, 24,\n",
" 21, 20, 21, 21, 21, 19, 24, 18, 22, 23, 21, 15, 18, 21, 27, 14, 17,\n",
" 21, 23, 26, 23, 25, 22, 20, 20, 18, 19, 20, 21, 25, 13, 21, 17, 19,\n",
" 18, 21, 19, 17, 21, 19]))"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"maxlens, depths = get_quick_depths(data, sample)\n",
"maxlens, depths"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data = s3.data\n",
"samples = list(s3.data.samples.values())\n",
"sample = samples[1]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"sample = samples[1]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"concat_multiple_edits(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"merge_end_to_end(data, sample, False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"dereplicate(data, sample, 2)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"split_endtoend_reads(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"mapping_reads(data, sample, 2)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"build_ref_cigars(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#samtools mpileup -uf ref.fa aln1.bam aln2.bam | bcftools view -bvcg - > var.raw.bcf\n",
"! samtools mpileup -ur /home/deren/Dropbox/opbox/Maud/lgeorge.genome.fa "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"#regions = bedtools_merge(data, sample).strip().split(\"\\n\")\n",
"fullregions = bedtools_merge(data, sample).strip().split(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"regions = (i.split(\"\\t\") for i in fullregions)\n",
"regions = ((i, int(j), int(k)) for (i, j, k) in regions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def get_ref_region(reference, contig, rstart, rend):\n",
"    \"\"\"Return the reference sequence over a given region.\n",
"\n",
"    Runs ``samtools faidx`` for the region ``contig:(rstart + 1)-rend``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    reference : str\n",
"        Reference file handed to ``samtools faidx`` as its input.\n",
"    contig : str\n",
"        Contig name used to build the region string.\n",
"    rstart : int\n",
"        Region start; one is added before it is passed to samtools.\n",
"    rend : int\n",
"        Region end, passed to samtools unchanged.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        ``[name, seq]`` where ``name`` is the first line printed by samtools\n",
"        and ``seq`` is the rest of the output with newlines removed.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If samtools exits with a non-zero status, or if its output does not\n",
"        contain a complete record (no newline after the first line).\n",
"    \"\"\"\n",
"    # imported here so the cell does not rely on a star-import providing sps\n",
"    import subprocess as sps\n",
"\n",
"    region = \"{}:{}-{}\".format(contig, rstart + 1, rend)\n",
"    cmd = [ip.bins.samtools, 'faidx', reference, region]\n",
"\n",
"    # capture stderr as well so that a failed lookup can be reported\n",
"    proc = sps.Popen(cmd, stdout=sps.PIPE, stderr=sps.PIPE)\n",
"    stdout, stderr = proc.communicate()\n",
"    if proc.returncode:\n",
"        raise ValueError(\n",
"            \"samtools faidx failed for {}: {}\".format(\n",
"                region, stderr.decode().strip()))\n",
"\n",
"    # the first line is the record name, the remainder is wrapped sequence\n",
"    text = stdout.decode()\n",
"    if \"\\n\" not in text:\n",
"        raise ValueError(\"samtools faidx returned no record for \" + region)\n",
"    name, seq = text.split(\"\\n\", 1)\n",
"    return [name, seq.replace(\"\\n\", \"\")]"
]
},
{
"cell_type": "code",
"execution_count": 527,
"metadata": {},
"outputs": [],
"source": [
"def build_ref_cigars(data, sample, fullregions=None, outpath=\"test.clustS\"):\n",
"    \"\"\"Draft: write the read-pair clusters of one sample as plain text.\n",
"\n",
"    NOTE: a later cell defines ``build_ref_cigars`` again (gzipped output);\n",
"    whichever definition was executed last is the one that gets called.\n",
"\n",
"    For every region, reads fetched from ``<sample.name>-mapped-sorted.bam``\n",
"    are paired by query name. Each complete pair is placed on a '-' filled\n",
"    array as long as the reference region, joined into one sequence and\n",
"    stored under a ``contig:start-end;size=N`` header, where N is the text\n",
"    after the last '=' of the read name. Every cluster is followed by the\n",
"    '//' separator lines.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : object\n",
"        Provides ``dirs.refmapping`` and\n",
"        ``paramsdict[\"reference_sequence\"]``.\n",
"    sample : object\n",
"        Provides ``name``.\n",
"    fullregions : list of str, optional\n",
"        Lines of tab-separated ``contig, start, end``. Computed with\n",
"        ``bedtools_merge(data, sample)`` when omitted, so the result does\n",
"        not depend on a notebook-level variable.\n",
"    outpath : str, optional\n",
"        File the clusters are written to.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"    \"\"\"\n",
"    if fullregions is None:\n",
"        fullregions = bedtools_merge(data, sample).strip().split(\"\\n\")\n",
"\n",
"    # generator yielding (str, int, int); blank lines are skipped\n",
"    fields = (line.split(\"\\t\") for line in fullregions if line.strip())\n",
"    regions = ((i, int(j), int(k)) for (i, j, k) in fields)\n",
"\n",
"    bampath = os.path.join(\n",
"        data.dirs.refmapping, \"{}-mapped-sorted.bam\".format(sample.name))\n",
"\n",
"    separator = \"\\n//\\n//\\n\"\n",
"    batchsize = 1000\n",
"    clusters = []\n",
"\n",
"    # both files are closed even when a region raises part-way through\n",
"    with AlignmentFile(bampath, 'rb') as samfile, open(outpath, 'w') as out:\n",
"        for reg in regions:\n",
"            ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n",
"            reflen = len(ref[1])\n",
"\n",
"            # match paired reads together in a dictionary\n",
"            rdict = {}\n",
"            for read in samfile.fetch(*reg):\n",
"                if read.qname not in rdict:\n",
"                    rdict[read.qname] = [read, None]\n",
"                else:\n",
"                    rdict[read.qname][1] = read\n",
"\n",
"            # sort keys by derep number\n",
"            keys = sorted(\n",
"                rdict, key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n",
"\n",
"            # build the cluster based on map positions and cigar\n",
"            clust = []\n",
"            for key in keys:\n",
"                r1, r2 = rdict[key]\n",
"                if not (r1 and r2):\n",
"                    continue\n",
"                arr1 = np.full(reflen, \"-\", dtype=\"U1\")\n",
"                arr2 = np.full(reflen, \"-\", dtype=\"U1\")\n",
"\n",
"                try:\n",
"                    # how far ahead of the region start does each read begin\n",
"                    seq = cigared(r1.seq, r1.cigar)\n",
"                    start = r1.reference_start - reg[1]\n",
"                    arr1[start:start + len(seq)] = list(seq)\n",
"\n",
"                    seq = cigared(r2.seq, r2.cigar)\n",
"                    start = r2.reference_start - reg[1]\n",
"                    arr2[start:start + len(seq)] = list(seq)\n",
"\n",
"                    pairseq = \"\".join(join_arrays(arr1, arr2))\n",
"                    derep = r1.qname.split(\"=\")[-1]\n",
"                    clust.append(\"{}\\n{}\".format(\"{}:{}-{};size={}\"\n",
"                        .format(*reg, derep), pairseq))\n",
"                except ValueError:\n",
"                    # best effort: report the region and skip this pair\n",
"                    print(reg)\n",
"\n",
"            # regions without a usable pair produce no record\n",
"            if clust:\n",
"                clusters.append(\"\\n\".join(clust))\n",
"\n",
"            # write a full batch once, then start a new one\n",
"            if len(clusters) >= batchsize:\n",
"                out.write(separator.join(clusters) + separator)\n",
"                clusters = []\n",
"\n",
"        # write whatever is left after the last full batch\n",
"        if clusters:\n",
"            out.write(separator.join(clusters) + separator)"
]
},
{
"cell_type": "code",
"execution_count": 549,
"metadata": {},
"outputs": [],
"source": [
"def build_ref_cigars(data, sample, fullregions=None):\n",
"    \"\"\"Write the reference-mapped read pairs of one sample as clusters.\n",
"\n",
"    For every region, reads fetched from ``<sample.name>-mapped-sorted.bam``\n",
"    are paired by query name. Each complete pair is placed on a '-' filled\n",
"    array as long as the reference region, joined into one sequence and\n",
"    stored under a ``contig:start-end;size=N`` header, where N is the text\n",
"    after the last '=' of the read name. Records of one region are joined\n",
"    by newlines and every cluster is followed by the '//' separator lines.\n",
"    Output goes to ``<sample.name>.clustS.gz`` in ``data.dirs.refmapping``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : object\n",
"        Provides ``dirs.refmapping`` and\n",
"        ``paramsdict[\"reference_sequence\"]``.\n",
"    sample : object\n",
"        Provides ``name``.\n",
"    fullregions : list of str, optional\n",
"        Lines of tab-separated ``contig, start, end``. Computed with\n",
"        ``bedtools_merge(data, sample)`` when omitted, so the result does\n",
"        not depend on a notebook-level variable.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"    \"\"\"\n",
"    if fullregions is None:\n",
"        fullregions = bedtools_merge(data, sample).strip().split(\"\\n\")\n",
"\n",
"    # generator yielding (str, int, int); blank lines are skipped\n",
"    fields = (line.split(\"\\t\") for line in fullregions if line.strip())\n",
"    regions = ((i, int(j), int(k)) for (i, j, k) in fields)\n",
"\n",
"    bampath = os.path.join(\n",
"        data.dirs.refmapping, \"{}-mapped-sorted.bam\".format(sample.name))\n",
"    opath = os.path.join(\n",
"        data.dirs.refmapping, \"{}.clustS.gz\".format(sample.name))\n",
"\n",
"    separator = \"\\n//\\n//\\n\"\n",
"    batchsize = 100\n",
"    clusters = []\n",
"\n",
"    # both files are closed even when a region raises part-way through\n",
"    with AlignmentFile(bampath, 'rb') as samfile, gzip.open(opath, 'w') as out:\n",
"        for reg in regions:\n",
"            ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n",
"            reflen = len(ref[1])\n",
"\n",
"            # match paired reads together in a dictionary\n",
"            rdict = {}\n",
"            for read in samfile.fetch(*reg):\n",
"                if read.qname not in rdict:\n",
"                    rdict[read.qname] = [read, None]\n",
"                else:\n",
"                    rdict[read.qname][1] = read\n",
"\n",
"            # sort keys by derep number\n",
"            keys = sorted(\n",
"                rdict, key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n",
"\n",
"            # build the cluster based on map positions and cigar\n",
"            clust = []\n",
"            for key in keys:\n",
"                r1, r2 = rdict[key]\n",
"                if not (r1 and r2):\n",
"                    continue\n",
"                arr1 = np.full(reflen, \"-\", dtype=\"U1\")\n",
"                arr2 = np.full(reflen, \"-\", dtype=\"U1\")\n",
"\n",
"                # how far ahead of the region start does each read begin\n",
"                seq = cigared(r1.seq, r1.cigar)\n",
"                start = r1.reference_start - reg[1]\n",
"                arr1[start:start + len(seq)] = list(seq)\n",
"\n",
"                seq = cigared(r2.seq, r2.cigar)\n",
"                start = r2.reference_start - reg[1]\n",
"                arr2[start:start + len(seq)] = list(seq)\n",
"\n",
"                pairseq = \"\".join(join_arrays(arr1, arr2))\n",
"                derep = r1.qname.split(\"=\")[-1]\n",
"                rname = \"{}:{}-{};size={}\".format(*reg, derep)\n",
"                clust.append(\"{}\\n{}\".format(rname, pairseq))\n",
"\n",
"            # regions without a complete pair produce no record\n",
"            if clust:\n",
"                clusters.append(\"\\n\".join(clust))\n",
"\n",
"            # write a full batch once, then start a new one\n",
"            if len(clusters) >= batchsize:\n",
"                out.write((separator.join(clusters) + separator).encode())\n",
"                clusters = []\n",
"\n",
"        # write whatever is left after the last full batch\n",
"        if clusters:\n",
"            out.write((separator.join(clusters) + separator).encode())"
]
},
{
"cell_type": "code",
"execution_count": 550,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_ref_cigars\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m\u001b[0m in \u001b[0;36mbuild_ref_cigars\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mclusters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mreg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mregions\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0mref\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_ref_region\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparamsdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"reference_sequence\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mreg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0mreads\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msamfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mreg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mget_ref_region\u001b[0;34m(reference, contig, rstart, rend)\u001b[0m\n\u001b[1;32m 2150\u001b[0m \"\"\"\n\u001b[1;32m 2151\u001b[0m \u001b[0minp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"{}_derep.fastq\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2152\u001b[0;31m \u001b[0mout1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"{}_R1-tmp.fastq\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2153\u001b[0m \u001b[0mout2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"{}_R2-tmp.fastq\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2154\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/lib/python3.6/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)\u001b[0m\n\u001b[1;32m 707\u001b[0m \u001b[0mc2pread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc2pwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 708\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0;31m# Cleanup if the child failed starting.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/lib/python3.6/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1296\u001b[0m \u001b[0merrpipe_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1297\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1298\u001b[0;31m \u001b[0mpart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrpipe_read\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m50000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1299\u001b[0m \u001b[0merrpipe_data\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mpart\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1300\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpart\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrpipe_data\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m50000\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"c = build_ref_cigars(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 547,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"b'Contig0:3481-3748;size=3\\nCATGAGGTCCTTAGGGAGGGAATCAACGAAGTAGCCAAACACTTGCTGAATGTTGAAATGAGCCCCTACAG------------------------------------------------------------------------------------------------------------------------TTCCAGGGTCACACCAGCCAAGGCACTGTTGCAGTCACTTAACAAAGCAGTGTGATTCCCAGTTGGCCACAGAATT\\n//\\n//\\nContig0:4544-5088;size=1\\nCATGCTTGTTTAAAGTTGTGCAGTGCTCTTTTCTGGCAGGGGATTGGTCCTGCTTTGAGCAGGGGGTTGGA--------------------------------------------------------------------------------------------------------------------------------------TGGACTGGGAGCCCCTCTGTCAGCTCCCTGCTCCCCTAAGTTCCCTGTGCTGCAGTCGCCCAGCAGGCTATCAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:4544-5088;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGCTGCTCCTGCCCTCTGCCTTGGAGCTGCTCCCAGAGACTCCTGCTTGCTGTGCAGGGAGGAAAGG------------------------------------------------------------------------------------GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAACTTCCTGATCCACTGACAAACAATGCAATT\\nContig0:4544-5088;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGCTGCTCCTGCCCTCTGCCTTGGAGCTGCTCCCAGAGACTCCTGCTTGCTGCGCAGGGAGGAAAGG------------------------------------------------------------------------------------GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAAC
TTCCTGATCCACTGACAAACAATGCAATT\\nContig0:4544-5088;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGTGTCCCCCTCCTCCCTGCTCCTGCACCCTGCTTACCTCTTCTCCATATAGAGCAGGGAGGGGACAC-GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAACTTCCTGATCCACTGACAAACAATGCAATT\\n//\\n//\\nContig0:7193-7952;size=1\\nAATTGATGGGACCCTGGAACATTTCTAGCTTGTACTGATTGTTTTGGGATTTTTTTGTTTGTTCTCTGGTTTCAATAGCAGTTGGGTGCCCAGATACCTGGGGGGAGCAGTTGGAGGGGGGTTCGCCTTCCTCTGCAGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:7193-7952;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------CATGGAGCACAGGACATTTGCAGGTTTAAACTAGTGTAAATGGTGAATCCTCTGTGACTTAAAGTCTTTAA----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAAGCAGCAAATAGATGGGGTGAGGGCAGGTGAATGGAGCGACTCCAATCAGATCATGGGTTTTAGCAGACTAATT---------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:7193-7952;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTAACATTGCTGTGCGATTCCTGGACCACTGCAGAATTCAGTGAAATCTAGGGGTCTTGGCTGCCTAAATACAT------------------------------------------------------------CCTAGCACCAGGGGACCTCCAGGTGCTACTGTAATATACAAGGGGATGATGATGATAGGATGCAGCACATG\\n//\\n//\\nContig0:13327-13845;size=4\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCTTTCTCATTCACACACTTTTCATAGGGTGCTACAGTCATGCATCCTGATCCATACTTCTGTTACCC---------------------------------------------------------------------------------------------CTGCCCCCTCCCGTGGACGGACTCCGGGTGGGTGTATCGCAGGCTGGGAGCATACTGAAGGTGGACACATG\\nContig0:13327-13845;size=1\\nCCAGCCAGGGCTTTGTCAAGCCTGACCTTAAAAACCTCTAAGGAAGGAGATTCCACCACCTCCCTAG----------------------------------------------------------------------------------------------------------------------TGAGAACAGTCTAGATCCAACCTCTTTGGAACCCCCTTTCAGGTAGTTCAAAGTAGCTATCAAAT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------\\n//\\n//\\nContig0:14393-14579;size=1\\nCATGCTGGGCCAGATCCACCCCCCTGGGGAAGCCCAGGGGATTGCACTAGGGATCAGGTTTGCCCCTT---------------------------------------------TTTAAATGAGTCCTTATTTGACTCTGGGCTCACCAGGCCACCTCCAAGCTGTTCCTTGGCTGGCTCCGTAATT\\n//\\n//\\nContig0:14907-15026;size=1\\nCATGATCTTGGACACATCTGTTCACCTCTCTTCTTCCCCTCCACGCCCCACCACCCAAAAAAATCTGTGAAATGGTAGGGTTGATGCTGCTTTACCTTCTGGGGGTGCTTGTGAGAATT\\n//\\n//\\nContig0:15377-15552;size=1\\nAATTGCCCTGCTTGCTACCCCTTAACACCAGCCCTGGGTTTTACATGCAGAAAACAGGTGTTGTGGCACAGG--------------------------------TGATCTCGGCAGGCTGCTGTTGGTCCATGTGCCATGGTCCAGCAGCACTCCTGGACTTGCAAACATTCATG\\n//\\n//\\nContig0:16736-16933;size=1\\nAAATTCTCAGGGCTTTGCTATGCAGGAGATCAGACTAGATTA------------------------------------------------------------------------------------GCAGGAAAATCTGTTACCTTTGTGTTTTCCCTGGGAGAGGTGGGGGCGCCGCTGGCTCCGACTCCGGCATG\\n//\\n//\\nContig0:19843-20061;size=1\\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGGAGCAGC-----------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCAAGGATGAATAGAGATCTGGACTGGAAGCTAAATT\\nContig0:19843-20061;size=1\\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGG-----------------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCATGGATGACTAGAGATCTGGACTGGAAGCTAAATT\\nContig0:19843-20061;size=1\\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGGAGCAGC-----------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCATGGATGACTAGAGATCTGGACTGGAAGCTAAATT\\n//\\n//\\nContig0:22414-22496;size=1\\nAATTCTCGATCATCACGTGCACTGGGTGTGGCCGGATTTCCGAGATGGATGGAACTGGGTGAGCCAAATGAATCCCTCCATG\\n//\\n//\\nContig0:22928-23117;size=1\\nCATGGGCCAGGATTCCAAAGGTTAACACCCCGGCAGCAGAGGTGGGGAGCTGAGAGCACTCAGCACCTCGA--------------------------------------------GGACTGCACTCTATATCCACCCTT
GAATAGGAACCTACATGAAATGGAGTCGCTCTCCAGATGTTTGGGGAATT\\n//\\n//\\nContig0:23425-23627;size=1\\nCATGCATAAGGTCTCTAAATCCCACCACTGTTCTGATCTTTGTTTTGTAAAGTGCCCATGAGCGTAGCATC-------------------------------------------------------------------------------------GGAGAGGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATT-\\nContig0:23425-23627;size=1\\nCATGCATAAGTTCTCTAAATCCCACCACTGTTCTGATCTTTGTTTTGTAAAGTGCCCATGAGCGTAGCATC------------------------------------------------------------------------------------------GGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATTA\\nContig0:23425-23627;size=1\\n--------------------------------------------------------CATGAGCGTAGCATCAGGCCCAAATGACAGGGCTGATATGCCACACACTGGTAAACTCTGTCCCATTTCTCATTCACTCTGCATTCAGTGGTGTAATGTGGGAGAGGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATT-\\n//\\n//\\nContig0:24391-24908;size=3\\n--------------------------AATTATTATGTACATAGTTTCGTCCTATTCAGTGTCTACTCAGCGCTTCTCGGCTTGTCTCTTGTATTCATTAAAT-------------------------------------------------------------------------------------------------------------------CGCTCACTGCTCAGCAGTTCGATGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT----------------------------------------------------------------------TGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCC--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT-------------------------
-----------------------------------------------------------------------------------------------------------------------CGCTCACTACTCCACAGTTCAGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATCTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACGTAGTTTAGTCCTATTCAATGTCTACTCAGCGCT----------------------------------------------------------------------TGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCAATGTTGCAATGTCCTGGTGAAATCGCTCGCCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT--------------------------------------------------------------------------GGGCTCCATTTGCCCTGATAGTGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCACCGTGCTC--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT---------------------------------------------------------------------TTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTCCTGCAATGTCCTGGTGAAATCGCACGCCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------\\nContig0:24391-24908;size=1\\n-----------------------------------------AATTTAGTCCTATTCGGGGTCTACTCAGCGCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGCATCTCTTGTC----------------------------------------------------------------------------------------------------CGCTCACCGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:24391-24908;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGCCGTCTTTTCCTTGCCATGCAGTGCATGGTCAAATGCATCGTCTCGAAGAAGTTCACCAATCTGAGGACCAACAAAGACACCTTCCTTTGTCTTAGCTTCACTTAACCTTGGAAATT\\n//\\n//\\nContig0:25290-25837;size=1\\nAATTGATGGCAAAACATTGCCATTATGCAGCAAAACAACTTTAAGACTCGTCTTCGATGAATCAATGAACAGTCTC----------------------------------------------------TGTTGCAGGCTACTAGATCACCTTCCATGAAGAAGAATGGGACAAGATCCTTTTGATGGTCACGGAACATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:25290-25837;size=1\\nAATTGATGGCAAAACATTGCCATTATGCAGCAAAACAGCTTTAAGACTTGTCTTCGATGAATCAATGAACAGTCTC----------------------------------------------------TATTGCAGGCTCCAAGATCACCTTCCATGAAGAAGAATGGGACAAGATCCTTTTGATGGTCACGGAACATG----------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:25290-25837;size=1\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTAATAAAGTTTGACTACATTTATTTCAGAAGCATTTTGGCTGTAGAGCAGCGAAATCAAAGCTGTGCCGATGC------------------------------------------------------------------------------------------AAGCAAATAGAGACATTTTCAGGTCTGCTGGCTGGTTAGGCTCCATAGTCCTACGCAAGGATGTGATCATG\\n//\\n//\\nContig0:28260-28458;size=1\\nCATGCGTCTGCAGAAAAGTACGGAGACTGGGCAAAGGCTGTGAGAAAGAAGAACGTTTGGGGCTTGTCCAA---------------------------------------------------ATCCAGTTCCTCTGCCTACTCTGGGACATTTATATCACAGAGTAGGCACTGTAATAAAAAGCAAGCAATGAGAATT\\n//\\n//\\nContig0:28945-29088;size=1\\nAATTCAAGAACTCAAAGGGACACTATCACCTTAAACTAGCAGGCACACACAAGAGGCAGTAGCTAGGTTGATGGGAGGATGCTCAGAGGTGTGGATTCTTTCATCGATCTAGCAGTGCCCACACTAGGGCTTAGGGTGGCATG\\n//\\n//\\nContig0:30099-30713;size=7\\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCATTTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------TGGTAGGGGCTGAAGGCTCTGGTCTGGGGTGGGCTTGATCTCTTGTACAGAAAACAGAGCTGAACCACATG\\nConti
g0:30099-30713;size=1\\nAATTAATGCTAGTCTTAATGAGAGTCTATTTCAACTCTTATGTAGCTGACAGGCTTTCTCAGGGCATGGCTGCTGT-------------------------------------------------------------------------------------------------TAGTTCCTTGAAGAATCAGGTTTTCCCTATTTTTTTTTAATCTCCTGTGTCAACTGGCGTCTTGGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:30099-30713;size=1\\nAATTAATGATAGTCTTAATGAGAGTCTATTTCAACTCTTATGTAGCTGACAGGCTTTCTCAGGGCATGGCTGCTGT------------------------------------------------------------------------------------------------TTAGTTCCTTGAAGAATCAGGTTTTCCCTATTTTTTTTTAATCTCCTGTGTCAACTGGCGTCTTGGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:30099-30713;size=1\\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCAATTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------TGGTAGGGGCTGAAGGCTCTGGTCTGGGGTGGGCTTGATCTCTTGTACAGAAAACAGAGCTGAACCACATG\\nContig0:30099-30713;size=1\\n--------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCATTTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------------------------------TGGGGTGGGCTTGACCTCTTGTACAGAAAACAGAGCTGA--------\\n//\\n//\\nContig0:32872-33505;size=1\\nAATTCCTCTCCCTCCCTAACGTTAATCCCCCTGATATATTTATATAGAGCAAGCA---------------------------------------------------------------------------------CTCGGATCATCCTAGTAGCCCGTCTCTGAACCTGTTCCAGTTTGAATTC----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:32872-33505;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGATCCACTCAAGCAGTGCAAAGCAGCCGGAAATCCAGCCAATCAAGTTTGTATAGGCAAAATCCCTG-----------------------------------------------------------------------------------------------------------------------------------GGAAGCTGTGAAAGCAGCAGACAGCCACCTTGCCCCCAGCCTTAAAATCTGGACTGAGCTGAGTTTGGATGCAATT\\n//\\n//\\nContig0:33785-34014;size=3\\nAATTAGTGCTAGGAGTCATTTTATTTAGCACCGTCATTAATG
ATTTAGGAAGGTGGAGCGTTAGGAGCCCATTAAT----------------------------------------------------------------------------------ATAGGCCTTCCCAAGCTTTGGCAACCACCTATTCCCATAGGTGCTACAGGGGGCAGAGCTCCGTGTGCATG\\nContig0:33785-34014;size=1\\nAATTAGTGCTAGGAGTCATTTTCTTTAGCAGCGTCATTAATGATTTAGGAAGGTG------------------------------------------------------------------------------------------------------------CCTTCCCAAGCTTTGGCAACCACCTATTCCCATAGGTGCTACAGGGGGCAGAGCTCCGTGTGCATG\\n//\\n//\\nContig0:36273-36404;size=1\\nAATTGCCCCACTTGCCTCCTTGCCGGGAGGCCCTGAAACACCCCCAGGGAAAAATAAAACTCAGCGCCTGTGATATGCAGTCTTAGATATAGAGGCAGGCTGAGGAGGGGCTGGGTGTGAGGGGGAGCATG\\n//\\n//\\nContig0:37976-38353;size=5\\nAATTAAATGCTACAGTGTTGAGCTGTTTTTTGAGGAAGAGCCTGTTCCTTGAAAGCAACTGGCCTTTGCTTTTCTT----------------------------------------------------------------------------------------------------------------------------GAGGCTGTGGGATCGGCAGCTGGAGTTCCCAGGGCTGCTTGCTGAACTTGTTTATTGTCGATTTTCCCATG----------------------------------------------------------------------------------------------------------\\nContig0:37976-38353;size=1\\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTTAAGTATCCTCACACCTCTTGTCTACTGTCTGAAATGGGCCGTCTTGNTTATCACTACAAAAGTTTTTTTTTTCTCCTGCTGATAATAGCTCATCTTAATTAATT\\n//\\n//\\nContig0:39644-39906;size=1\\nAATTTGGACATCACGATTCTCAGGCGGCAGGTTCATGCCATGGAACACCGATTCTGGGCCTTGGAAAC---------------------------------------------------------------------------------------------------------------------------GTGCAAGAATACCAAGATGAGAGCAGCCCTCACAGTTGAGAAGCAAGTGACGATAGCCCTGTGGAGGCATG\\n//\\n//\\nContig0:40396-40633;size=11\\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATGGTGGTA------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGA
TGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\\nContig0:40396-40633;size=1\\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGGCTGAGCAAGTGCAGAATGGTGGTA------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\\nContig0:40396-40633;size=1\\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATGGTG------------------------------------------------------------------------------------------------------------------------GAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\\nContig0:40396-40633;size=1\\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATG------------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\\n//\\n//\\nContig0:41620-42163;size=14\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG------------------------------------------------------------------------------------------------CATCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\\nContig0:41620-42163;size=1\\nCATGGCATGTGTGCTTTCTTTACAAGATCGCATTTTGCCTCTTATATTGAGGGCCTGCTGGTTTGGCGTGAGAGATCACACACGCAGGGCTGGTGGGCAACAGAATT---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-\\nContig0:41620-42163;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG--------------------------------------------------------------------------------------------------TCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\\nContig0:41620-42163;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTC----------------------------------------------------------------------------------------------------TCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\\nContig0:41620-42163;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG------------------------------------------------------------------------------------------------CATCCCCATACACATTAATGGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\\n//\\n//\\nContig0:42764-42845;size=1\\nCATGGTCACCTGTGCTGATGAGCTCTGCATGGTCACCTGTGCTGATCAGCTTGCCACGCTGGTCAAACAGGAAATCAAATT\\n//\\n//\\nContig0:44568-44784;size=1\\nAATTTCCACAGGGCGGGAAAACCATTTTGTGCCCAGCACCCATTCAGGCAGCTGGTCACTGGGGTTGTCTGTGATG---------------------------------
------------------------------------CCCTTTGATGACCATCCCCGCTGCTTAAGACCCAAAAGCAGAGCCTCGACCGCTGATGTTTCATGCACATG\\n//\\n//\\nContig0:48224-48506;size=2\\n-------------------------------------------------------------------CATGTCCAAATAATGTTGGACAACATAAGAAGGGCATTTTATATAAACAAGCAGGGGGGAGTGAGATCTCT--------------------------------------------------------------------ATATTTACCAGGGTTGCAAAACTCCCTAGCAGAGAGCCTGAGTGGATGTTTTTCCACAAATAGGAGATTCACAATT\\nContig0:48224-48506;size=1\\nTAATTTACTAGAACTCCAGGTGGTTCACTGAGCCTGCATCTCCTTCCCCCTGCTCATCCAATCCAGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\n//\\n//\\nContig0:49781-50026;size=1\\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTTCCTCTTATGTGGAAGGGGAAGTAGTGGTCTTGA--------------------------------------------------------------------------------------------------GGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATGCACTTTGCAGCAGCACCCATG\\nContig0:49781-50026;size=1\\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTT--------------------------------------------------------------------------------------------------------------CGGCTTCGTGTTGAAGCACTCGGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATG---------------------\\nContig0:49781-50026;size=1\\nAATTTGCTTTTGCGGGCCAGTCCCATCTAAGGGCTACTCAGTTCCTCTTATGTGGAAGGGGAAGTA------------------------------------------------------------------------------------------------------------------CGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATGCACTTTGCAGCAGCACCCATG\\nContig0:49781-50026;size=1\\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTTCCTCTTATGTGGAAGGGGAAGTAGTGGTCTTGA-----------------------------------------------------------------------------CGGCTTCGTGTTGAAGCACTCGGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATG---------------------\\n//\\n//\\nContig0:51214-51341;size=1\\nCATGCCAGGTCCCACTGGCGGAGGCAGGGTCACGCCCAGGCCATCCCTGGAAGTACCAGAATGTCCAGTATTCTGGGGATGCATTAGGGGTCACT
TTACTCAAAAGTGGCAAAGCCTCTAGCGAATT\\n//\\n//\\nContig0:53474-53686;size=1\\nCATGTTAACCAGGCCGGGTGCCCCAGGCCTCAGGCATCCTGCCTCTTAACCAGGCCCAGATGCTCTTCTTT--------------------------------------------------------------------------------------------------AACTCGCTGTGTGACTTTGATCAAGTCACTTTGCCTTTCCATG\\nContig0:53474-53686;size=1\\nCATGTTAACCAGGCCGGGTGCCCCAGGCCTCAGGCATCCTGC---------------------------------------------------------------------------------------------------GAACATTTGGTTTCTCTTCTGCGCCAATAACTCGCTGTGTGACTTTGATCAAGTCACTTTGCCTTTCCATG\\n//\\n//\\nContig0:53920-54018;size=1\\nCATGATATGCAGTGGAGGGGAGAGGAACCCCTGGGCCTGGCAGTTCAGAGCCCTGGCACCTCTGGGCTTGCTGCAGCAGTTACAAATGTAAAAAAATT\\n//\\n//\\nContig0:56088-56333;size=10\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG-------------------------------------------------------------------------------------------------TAATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\nTCATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCA-------------------------------------------------------------------------------------------------------------------------TAATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG----------------------------------------------------------------------------------------------------TAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG--------------------------------------------------------------------------------------------------AATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGACTTAAATCAAGAGATAGCCGAGATTGGAATCTG-------------------------------------------------------------------------------------------------TAATAATCAA
GAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\nContig0:56088-56333;size=1\\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG--------------------------------------------------------------------------------------------------------------AGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\\n//\\n//\\nContig0:59313-59545;size=1\\nCATGGAGGGTGGGGGCTTCTGGTTGTAACTGGGGATGATCCCTCTGACCAGTTCCATGCCACTGTGG-----------------------------------------------------------------------------------------GCTTCCCCTGGTGTCAGGGCAAGCCCCCGAGAGGAACAATGACAAAGCATCACATACCGAGGTGGAATATTTAATT\\n//\\n//\\nContig0:62283-62553;size=3\\n-AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGT----------------------------------------------------------------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\\nContig0:62283-62553;size=1\\nAAATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAA---------------------------------------------------------------------------------------------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\\nContig0:62283-62553;size=1\\nAAATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAAC--------------------------------------------------------------------------------------------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\\nContig0:62283-62553;size=1\\n-AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGT--------------------------------------------------------------------------------------------------------------------------GAGGACAGTGCTGACCATGCACATACTAACTAGTATGTTATAATGCCCTTATTTTGGGCTTTTCCTCCATG\\n//\\n//\\nContig0:63207-63392;size=1\\nCATGTTCAATGGTTTCACTTCAACGCAAACAGCCGTGGCTTCGATTGTGAACAGAAAATGTTGTGGCACAG--------------------------------------TGCCTCGCTGGTCT
CTGAGGGCCCTGAAGAGCTTGCACGCCAAAGGACCAGATGGCTGGTGGGTTAAATGGAAATT\\nContig0:63207-63392;size=1\\nCATGTTCAATGGTTTCACTTCAACGCAAACAGCCGCGGCTTCGATTGTGAACAGAAAATGTTGTGGCACAG--------------------------------------TGCCTCGCTGGTCTCTGAGGGCCCTGAAGAGCTTGCACGCCAAAGGACCAGATGGCTGGTGGGTTAAATGGAAATT\\n//\\n//\\nContig0:64734-65000;size=12\\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGGAGAG-----------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\\nContig0:64734-65000;size=1\\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGG---------------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\\nContig0:64734-65000;size=1\\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGGAGAG-----------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATGCCGCTCAGAGCCAATGGGGTGGCATTCATG\\nContig0:64734-65000;size=1\\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGGAGCCACTGCGAATGCACAAGAGAGCCCAATGT-----------------------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\\n//\\n//\\nContig0:65302-65475;size=1\\nAATTAAAAACGAACCATAAACAACAACTGAGAAAGGATTCCAGATCGTCTTTTAAAAAAGGAAGCGGGTTCTCCGG--------------------------GAAAAATGCACAGTTGATAGGAAAGAGTGTTAGCTCCATCGTGAGCCCCTGACCCACAAGGGCTGTGCATG\\n//\\n//\\nContig0:67691-68021;size=1\\nCATGATCTTATGGTCACTCTACCATTCTCCCATGCACCTGGAAGAGAAAGACAGAAGCTGTGTTAACCGAC-------------------------------------------------------------------------------------------------TTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCACCTCCCCTCAATAATGATACTTAGCACTTGATGAATT-----------------
---------------------------------------------------------------------\\nContig0:67691-68021;size=1\\n------------------------------CATGCACCTGGAAGAGAAAGACAGAAGCTGTGTTAACCGACTGAACCCATGAGATGCTAACGAGGTCTCTC-------------------------------------------------------------------TTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCACCTCCCCTCAATAATGATACTTAGCACTTGATGAATT--------------------------------------------------------------------------------------\\nContig0:67691-68021;size=1\\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTAAATAAGCCTTACCTGCTGTGAAATGTCAATACTATTTTGAAGATGGGGAAACCGAGGCATG\\n//\\n//\\nContig0:68467-69007;size=9\\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA------------------------------------------------------------------------------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\nAATTTTACCTGAAGTTCATTTCAGAAGGGGAAGCTAAACTGGGGCAAGTGGGTTTAAACTCATGTGACGGTTCCCACACACAAAGTTGCACCAGGGTACCCATG----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\n---------------------
-------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA-------------------------------------------------------------------------------------------------CCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTG-----------------------------------------------------------------------------------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA------------------------------------------------------------------------------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTTCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:68467-69007;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------AATTGCTGCCGAATCCGCAGTACCAGCCGACCTCCCACAGGCAAGCCGTCGAAGGCTGCCTGTCTGCCGCCCTCGCAGAGACCGGCAGGGCTCCCCCTGAAGCTTGCCCTGCCCCAGGGACATG\\n//\\n//\\nContig0:69863-70116;size=19\\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGTCGGTTGTCTCATAGAATCTCAGGGTTGGAAGG----------------------------------------------------------------------------------------------------------TGGGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\\nContig0:69863-70116;size=1\\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGTTGGTTGTCTCATAGAATCTCAGGGTTGGAAGG------------------------------------------------------------------------------------------------------------GGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\\nContig0:69863-70116;size=1\\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGCCGGTTGTCTCATAGAATCTCA-----------------------------------------------------------------------------------------------------------------------GGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\\n//\\n//\\nContig0:71142-71672;size=1\\nAATTAGGCTTGAATAAAAACTGGGAGTGGATGGGCCATTACACAAAGTAAAACTATTTCCCCATGTTTATTTTCCC-----------------------------------------------------------------------------CCTGCTGGTAATAGCTCACCTTAACTGATCACTCTCGTTAGAGTGTGGATGGTAACACCCATTGTTTCATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:71142-71672;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------AATTTGTTAGTCTCTAAGGTGCCACAAGGACTCCTGTTCTTTTTGAGGATACAGACTAGCATGGCTGCTACTCTGA------------------------------------------------------GTGCTGAGAGTTCGATGACTCTGGTAGCTGGGGGGATGTTCACTGGGACCCCCCCGCTCCCAGGAGACATG\\n//\\n//\\nContig0:73757-73975;size=1\\nCATGGTCTGTTTGCTACCTTGGGGCTGGCCTCGCTGGATGGT---------------------------------------------------------------------------------------------------------GGTGCTCCAGAGACTGGTGCTGAGGATGCAGCAGGGGGTGCTCTCCCTTCTGCATCAGAACGTATCCCATG\\n//\\n//\\nContig0:74377-74795;size=1\\nAATTGGTATAAGAATCCAAGGATGGTAACAAGAGCCGAAAATGCCACCCAAGGACTTCGATTCACTGACCTTGTGG-------------GCTGGAAAATGGCAGACACACTTGTTCTTTCTTGTTAGCCTCTGGAGACGCTGATGGGGCCTGAGGACATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:74377-74795;size=1\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCAGGTCAAGGAGTTAGCACCGGACTGGCAGCCCATGACTGTCCCCTGATACCATCACAGGGCTGG----------------------------------------------------------------------------------------------------------------------TGGGCACCCACACAGATCTGCTGATGCCCGTCACTCCTCACCGACGGCCACCTCGAGGGCAGATTAACATG\\n//\\n//\\nContig0:75032-75494;size=2\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG---------------------------------------------------------------------------------AAAGCCCCTTGCTCCTGCTACCCAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGG
GTCAGAGCCCAG--------------------------------------------------------------------------------------------------------------------CCCCTGCCACATTCAGAATCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG-------------------------------------------------------------------------------------CCCCTTGCTCCTGCTACCCAACCGCCTGCCCCCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG--------------------------------------------------------------------------------------------------------------------CCCCTGCCCCATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG-------------------------------------------------------------------------------------------------------------CCTGCCTCCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG------------
--------------------------------------------------------------------------CCCTTGCTCCTGCTACACAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG---------------------------------------------------------------------------------AAAGCCCCTTGCTCCTGCTCCCCAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:75032-75494;size=1\\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGAGATTGAGGGTTTTGAATATACAACTTCAGACTGTTAAGTGGGGTTATTGTAGCTTCCCTGCTCAATAGCTTCATACAGAATT\\n//\\n//\\nContig0:76480-77015;size=6\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTAACCAGTT------------------------------------------------------------------------------------ATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\\nContig0:76480-77015;size=4\\nCATGTCTTATCTGGATCAGTCAAC
AGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGC------------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:76480-77015;size=2\\nCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATC-GCCATCTATCTACAATATGTATCTAAGCA-----------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:76480-77015;size=1\\nCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATC-GCCATCTATCTACAATATGTATCTAAGCA-----------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCGGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:76480-77015;size=1\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGAGAATCTCTATAAGCACTATAGGGCTCATGACACACAGCTGTGTC
TTTCTGATTCCATCCAATAGGG------------------------------------------------------------------------------------------------------------------ATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\\nContig0:76480-77015;size=1\\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTA-------------------------------------------------------------------------------------------------AGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\\n//\\n//\\nContig0:79124-79379;size=1\\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCG-------------------------------------------------------------------------------------------------------------GGTGCTGGGCAAATGGTCGAAGACTTGTTCCAACCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\\nContig0:79124-79379;size=1\\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCGTACCTCGA--------------------------------------------------------------------------------------------------------------TGCTGGGCAAATGGTCGAAGACTGGTTCCAACCTCTCCACGCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\\nContig0:79124-79379;size=1\\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCGA---------------------------------------------------------------------------------------------------------------------------------------------CCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\\n//\\n//\\nContig0:79959-80151;size=1\\nAATTTTTAAGCAACAGGATTTACTACCCGCTGTGGGAGGTGACAGGCTGATGGCTGTCTGGGAGACCCTCTCTGTC---------------------------------------------GTAGCCTGAACAGGCAGGATCCCACTAGTTGGCCATCTCCTCCATCCTCCCGCTAGCCACAGGTAGACATG\\nContig0:79959-80151;size=1\\nAATTTTTAAGCAACAGGATTTACTACCCGCTGTGGGAGGTGACAGGCTGATGGCTGTCTGGGAGACCCTCTCTGTC--------
-------------------------------------GTAGGCTGAACAGGCAGGATCCCACTAGTTGGCCATCTCCTCCATCCTCCCGCTAGCCACAGGTAGACATG\\n//\\n//\\nContig0:81339-81852;size=11\\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGAC--------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=6\\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGAC--------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGCTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=4\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCC-----------------------------------------------------------------------------------------------------------------------------GCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\\nContig0:81339-81852;size=1\\nCATGCGTGGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGG------------------------------------------------------------------------------------------------------TATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCT
CCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=1\\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGAT--------------------------------------------------------------------------------------------------------------------------TAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=1\\n-ATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGG----------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:81339-81852;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTT-------------------------------------------------------------------------------------------------------------------------------------GCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\\n//\\n//\\nContig0:82206-82699;size=4\\n------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGA----------------------------------------------------------------------------------------GGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\nContig0:82206-82699;size=1\\nCATGCAATGCAACAAGGTCTGGAGCTCCTTAGCGAGCCTTCGAGCCACCCAGTCCCTGAAATACACCCCCTGGTCCCTTTCAGGCTGGATCCAGAATGGAAAGTGTAACACCAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:82206-82699;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTATCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGA----------------------------------------------------------------------------------------GGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\nContig0:82206-82699;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGG-----------------------------------------------------------------------------------------GGCACCAGATTTTCTCACTGTT
CAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\nContig0:82206-82699;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCCGCTCCTTTCCTCGCTTGATTTCTTCTTGAGGGAGGCAG-------------------------------------------------------------------------------------------TGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\nContig0:82206-82699;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATATCTTCTTGAG---------------------------------------------------------------------------------------------------TGGCTCCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\\n//\\n//\\nContig0:83146-83367;size=1\\nAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCAGTTTTGACGAACTGGTATTTGTTGCTG-----------------------------------------------------------------------------CCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG\\nContig0:83146-83367;size=1\\nAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCAGTTTTGACGAACTGGTATTTGTTGCTG--------------------------------------------------------------------------CACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG\\n//\\n//\\nContig0:86329-86448;size=1\\nAATTTGGGGGCACTGCTTTTTGGTGCCCCCAAATCTCGGTGCCACCGCCTAGTTCACCTAGTGGTTACACCGGCCCTGAGCGTCTGGGGAGTCTTTCCTCCCGAAGGCCCAGCAGCATG\\n//\\n//\\nContig0:95645-95887;size=10\\nAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTT-----------------------------------------------------------------------------------------------AAGTGATAAGAAGAC
TTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG\\nContig0:95645-95887;size=4\\nAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTAGGTGTAAAGGGCATCCTT-----------------------------------------------------------------------------------------------AAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG\\n//\\n//\\nContig0:98705-98903;size=1\\nCATGCAGTGTAGCCGCTGTTTGTTGGCAGTGCGACAAAAAACTTCCACCATCAATGAGTGGCGTTTACATT---------------------------------------------------TCGCGGTAAAACTTTTGTCTTTCGGGGTGGGGTGGGGATGTTTGGGTTAACACCACAAAAGTTTTGTCATTCAATT\\n//\\n//\\nContig0:99204-99415;size=1\\nCATGTGAGGGCAGGAGAAATGCCAAATCCAATCACTGCCTAGGGACAACTCTGACCTTTACCCCGAACAGC--------------------------------------------------------------------------------------------CCAGGTTTGATCCCTCACTGGTGAGGAGTCCTGCAGAGGTAGCACATG\\nContig0:99204-99415;size=1\\nCATGTGAGGGCAGGAGAAATGCCAAATCCAATCACTGCCTAGGGACAACTCTGACCTTTACCCCGAACAGC----------------------------------------------------------------------------------------------------------TCACTGGTGAGGAGTCCTGCAGAGGTAGCACATG\\n//\\n//\\nContig0:101553-101788;size=4\\n----CATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGC---------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\\nContig0:101553-101788;size=3\\nCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGG----------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\\nContig0:101553-101788;size=1\\nCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAG-----------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\\n//\\n//\\nContig0:102134-102336;size=1\\nAATTTGGCCTGTTCTCCTTGCACTAGAAGTTGATATTTCCCTGCACATACGCTGCTCTTGCACGGACTCTTCT
GTT-------------------------------------------------------CTCTCCTCCTTGCTCTAACACTCAAGTAAAAAATCGCCATCCCACATTATACTCCCCAATCATCCCGCATG\\n//\\n//\\nContig0:103469-103709;size=16\\nCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGC---------------------------------------------------------------------------------------------CCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\\nContig0:103469-103709;size=1\\nCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGC---------------------------------------------------------------------------------------------CCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCTGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\\n//\\n//\\nContig0:104258-104379;size=1\\nAATTTTGAAAGATGTTACTATTTATTTAGTTTTTGGCTCGTTGTCGTTCCTGATTGACTTTGTTTTGCTGGAGCAGTTGGGAGGGATTATTTCACTCTTCTTGGCAGCTTCCACATACATG\\n//\\n//\\nContig0:104739-105191;size=1\\nAATTACCAAGGGTTCCAGGTGGGCTTGTACCACTCGTGCTTCAGCCCGTGCTGTTATAGCTTCGCTGCTTCTGTAC----------------------------------------------------------------------------------------------------------CTTAATGTGCTAGCGAAAGGGTTGATGAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:104739-105191;size=1\\nAATTACCAAGGGTTCCAGGTGGGCTTGTACCACTCGTGCTTCAGCCCGTGCTGTTATAGCTTCGCTGCTTCTGTAC----------------------------------------------------------------------CGTCAACAAACCTTTAGTCTCAATCAGCGACCTCCACTTAATGTGCTAGCGAAAGGGTTGATGAATTCATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:104739-105191;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------AATTCATGTAACGATTAAAAAACGACAACAACAATAAACAGCACAAAAGTATATTACTTTAGTGTCCACCACAGCA------------------------------------------------------------------------------------------------AAAAAAGATACCCGGACGCCAGCAGTTTTCACCCTGTAACGGGAGAAAAGCCACTCTATGAAGTCCACATG\\n//\\n//\\nContig0:106701-106806;size=1\\nCATGGCCTTGGTGAGAGAGCTCTGTTCCAGAATCTGAGAGTCTCTTGTCACCACACACTTCTTCAGCTCCAGTATTTAGAGCCCCGGGCTGGGATTCACAAAATT\\n//\\n//\\nContig0:107637-108115;size=13\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAA------------------------------------------------------------------------------------------------AGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT\\nContig0:107637-108115;size=1\\nCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATC----------------------------------------------------------------------------------GGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATT---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:107637-108115;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAA--------------------------------------------------------------------------------------------------------AGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAG
AGTCAGACTGAAAAATT\\n//\\n//\\nContig0:110181-110429;size=3\\nAATTGCATTTGTAAACCATTTCACGACTTTGATGCACGATTTGCATGCATGATTTATCCCGGTCATATGAGATGAT-----------------------------------------------------------------------------------------------------ACAACAAAAAACAGCACCCCGAGCTCAGTGCCCCCCAACCTGGCAGCCCAGGTGGTTGCCTGGGTCACATG\\n//\\n//\\nContig0:111104-111195;size=1\\nCATGCAAGCAGACCAGAGCTGCCCCAAAGCACTGACGGTGGCCCCTTGACCTGTTTGAAATGCTACATTTGCCCCAAGCTGCAAGTGAATT\\n//\\n//\\nContig0:111481-111551;size=1\\nAATTAAAAATAGGCTGCGGCAAGAATGGATCCTCTGAACAGCGAAGGGAGGTACGTGGCAGGCTCCATGC\\n//\\n//\\nContig0:112880-113019;size=1\\nCATGCTGCTGCTTCCAGGAGCTGCCTGAGGTAAGCGCTGCCCAGAGCCTGCAACTCGGCTATCTCAAAAACTGACTGCGACAGTGAAAAATGAACAGTTCAAGGCTGAGAGCATCTGGTGATAAACTAACAAAAAAATT\\n//\\n//\\nContig0:113494-113752;size=1\\nAATTTCCATACCCAGATGTGTCCACGCCTGGACTCTACAGCATGCCGCAATCACATCTTGATAGCTGTGTCTACAC---------------------------------------------------------------------------------------------------------------ACAGGATTGTACTTGGGTGCTCAGCAACCCCAACCAGGGGCGGCTCTAGACATTTTGCCACCCCAAGCATG\\n//\\n//\\nContig0:120824-121022;size=1\\nAATTAGAGCAACCTCGAGGCTGCTCTACCTTACAGTAGGAGCAGGATAGACTCCCGGATAGTTCCAGAATATGATC---------------------------------------------------CACCAAAAGCTGCTGGGCTGGGACAGAGAACTGAGGTCTGTTTCTCTCAAGAAAGAACCAGGTTTCACATG\\nContig0:120824-121022;size=1\\nAATTAGAGCAACCTCGAGGCTGCTCTACCTTACAGTAGGAGCAGGATAGACTCCCGGATAGTTTCAGAATATGATC---------------------------------------------------CACCAAAAGCTGCTGGGCTGGGACAGAGAACTGAGGTCTGTTTCTCTCAAGAAAGAACCAGGTTTCACATG\\n//\\n//\\nContig0:122245-122688;size=1\\nAATTCCTCTAAAGCCGGTGTTCTGGGGGACGGACTGACTCAGGGGTACGGGTAAGGGAATAGATCACAGATCACTT-------GGTCCAGGATTGTACTTCCCAAATATGACGACTATATTGTACCCTTGCCTGTGCAAAAGGAGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----\\nContig0:122245-122688;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCATCGACTTCAAGAAACGGTCCCTCCGGGGCAAGGCAGAGGCAGGCT------------------------------------------------------------------AGTAGGATGGCCAAAGGGTTATACCCTTCAGATCTCCTTTACAGGTGCCATCTTCCGAGTCCGACTGCATG\\nContig0:122245-122688;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCATCGACTTCAAGAAACGGTCCCTCCGGGGCAAGGCAGAGGCAGGCTTGTAGGGTATGGGATGGGGGAAA-------------------------------------------AGTAGGATGGCCAAAGGGTTATACCCTTCAGATCTCCTTTACAGGTGCCATCTTCCGAGTCCGACTGCATG\\n//\\n//\\nContig0:124322-124627;size=1\\nAATTACCCACCTTTCAGAAGGCTGATTAATGACATGGCTCTGCAGCATGCACAGTAGCTCAACCTCCTGCATGAAA-----------------------------------------------------------------------CAAGCTATTCTTTCCCAGATCTGGCGTAGGAAGTGCGGCGCTTCCAGCGGCCAGGGAAGATCCCTGCCATG---------------------------------------------------------------------------------------\\nContig0:124322-124627;size=1\\n---------------------------------------------CATGCACAGTAGCTCAACCTCCTGCATGAAACACACTCATCTCTCTCTCTCCTGAGATTGACAGGTCACGG-----------------------------------------------------------------------------------------------------------------GGCTTTGTAGCAAACTCAGCAGAAGCGGCTTCTCACAGAGTCAATCCTGGATACAGAAGATTTGCCATAGCAAATT\\n//\\n//\\nContig0:127068-127278;size=1\\nCATGATCCTGGGAGCAGCCAGTGAAGAAAGCGAAATGCCATTATAGTGGG-----------------------------------------------------------------------------------------AGGTGGGGCGAGCTGAGGACATCTTCCCAGACCATCTCTGAGCAGCACAGAGGGTGGATCCCAGCTCCATG\\n//\\n//\\nContig0:128188-128337;size=1\\nCATGGCTTCCTCTTTTATAAGGCTCAGGTGCCTTCCCTTAAGCCCAGTCGGGCAGC
AGCTAATCAGACACA--TGCCCTGGACGCCCCCCCTCCCTTCTGCTTCCTCCCTTAAAGGGGCCAGTCACCCTGTGACAGCAACATTTCAATT\\n//\\n//\\nContig0:129266-129465;size=1\\nAATTTACAAAATATGAAAGGAAACACTTCTTCACATAGCCTGTGGTACGCACTGCCCAGAGGGGCCAGAGAGTCCA------------------------------------------------------------------------------------CCTCAAGCTTCAGGACATCAACTGGTCAGGAAGGAATTT\\n//\\n//\\nContig0:132993-133221;size=2\\nAATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGAT---------------------------------------------------------------------------------ATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG\\nContig0:132993-133221;size=1\\n----AATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTC-----------------------------------------------------------------------------ATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG\\n//\\n//\\nContig0:135395-135645;size=11\\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATTCTAA-------------------------------------------------------------------------------------------------------CCGGAGGTGGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATTATACTCCCAGAATCATG\\nContig0:135395-135645;size=1\\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATT-----------------------------------------------------------------------------------------------------------CCGGAGGTGGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATTATACTCCCAGAATCATG\\nContig0:135395-135645;size=1\\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATTCTAA---------------------------------------------------------------------------------------------------------------------------------CGTAATCAGCGCTGCGGCAAGCTGCATTATCCTCCCAGAATCATG\\nContig0:135395-135645;size=1\\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGG-----------------------------------------------------------------------------------------------------------------GGAGGTCGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATT
ATACTCCCAGAATCATG\\n//\\n//\\nContig0:137333-137570;size=2\\nCATGGTGTTCTATCTGCTGAGCGTGTTCTGAGGCTTTTCAACCTAAAAAGTAAGTAAAACTGGAAATG---------------------------------------------------------------------------------------------ACCTGCGTATTTTACCCTTCTCTCCCCAAAATCACGTCTGAACAAAACCCATGGAATGCTCAATACATAGTCAATT\\n//\\n//\\nContig0:137831-137897;size=1\\nCATGTTTCCTAGCAACCTTAAACTTCAAGTTGACTAAGTTTGTTGTCTGGAAACCTACTATAAATT\\n//\\n//\\nContig0:139289-139543;size=1\\nAATTGAACCTGGCTTGACTGAGTTCTAGGCTCATGCCCCAGCCTCTGCACTGTGCTCCTTCTTACCAAGAGCCATC-----------------------------------------------------------------------------------------------------------TACGGTCATGTCGTACGTAGGGCTCTGTCCTGCTCCCACTGCAGCCAGTGGCAAACTCCCGTCACTTCATG\\n//\\n//\\nContig0:140146-140422;size=1\\nAATTCATTTGAATATAAATACTGTACTTACAAGTCAGTGTGTAGAGCAGTATAAACAGCTCATTGTCTGTATGAAG---------------------------------------------------------------------------------------------------------------------------------TGAAGAACCACTGCTTTAAAGGGCAGTGTGGGAGGTTGGAAAGTTGTATGATTGTTAGGAGGGACAGAGCA\\n//\\n//\\nContig0:141889-142575;size=3\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\nCATGCTCCATTCAGGTGGNCCGCGGACAGTTCCCTCTAAGACACGCGCCTGTGTGGCTGCACACGAGAGAATAAAGGACCACCCACTTAATT----------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCA---------------------------------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAAGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\n--------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAACGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:141889-142575;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTTCAAGTGGTCTGTGGAAAAAAGAAACTGAGAACCAGTGCATTAGAGGAAGACATAGAAGAAGGTTCAGAGTTTGGAGCAGAAGACGATTCCATGCATG\\n//\\n//\\nContig0:143165-143277;size=1\\nCATGGCTGGTGGAGGATGCTCAGTAACACCTAATACCTCTGCTGGCAGATATTGTTAACCCCATCTTTAGAGAGGGAAATCTGCCCTAAAGCCTGCCACATTTTGACCAATT\\n//\\n//\\nContig0:143580-143810;size=8\\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGTCATACTTTAGACTTTG-----------------------------------------------------------------------------------CTTCCAACTCTTACCTGCTGCTAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\\nContig0:143580-143810;size=1\\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAA-------------------------------------------------------------------------------------------------------------------------GC
TAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\\nContig0:143580-143810;size=1\\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGTCATACTTTAGACTTTG------------------------------------------------------------------------------------------------CCTGCTGCTAATCTTTGTTCAAACGTTGATGTCTCCAGGACGTGCCTTGGCCAACATG\\nContig0:143580-143810;size=1\\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGCCATACTTTAGACTTT------------------------------------------------------------------------------------CTTCCAACTCTTACCTGCTGCTAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\\n//\\n//\\nContig0:145164-145418;size=5\\nAATTTAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG-----------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\nContig0:145164-145418;size=1\\nAATTTAACGTCCATTAGACATTCTGAAGGGGAAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGAT------------------------------------------------------------------------------------------------------------------------GACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\nContig0:145164-145418;size=1\\nAATTTAACGTCAATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGCGCTGGT---------------------------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCATCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\nContig0:145164-145418;size=1\\nAATTTAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG----------------------------------------------------------------------------------------------------------------GACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\\nContig0:145164-145418;size=1\\nAATTTAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG-----------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCCTCCGGGAGGTGGAGAAGA
ATACGAATATTTAACATTTATAATCATG\\n//\\n//\\nContig0:145642-145890;size=1\\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC-----------------------------------------------------------------------------------------------------TGTCAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\\nContig0:145642-145890;size=1\\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACACAACGGTACGC-----------------------------------------------------------------------------------------------------TGTCAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\\nContig0:145642-145890;size=1\\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC--------------------------------------------------------------------------------------------------------CAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\\nContig0:145642-145890;size=1\\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC-------------------------------------------------------------------------------------------------AAGGTGTCAAATAGCAACAGGGCTGACCCAAAACCCATG-----------------------------------------\\n//\\n//\\nContig0:146117-146349;size=1\\nAATTAAACACGGGGTAAGCCACTGCCATGCTCTGACCCAAAGGTGGCTGGTGTATTTTCAGTAGCGGGAATGCACG-------------------------------------------------------------------------------------CCTAGAGCCCTGGGCCAGAGCAAGTGAGGGCCCCCCCACCCCCTCCAGTCCTGCCTCCAGCCCTGTTCATG\\nContig0:146117-146349;size=1\\nAATTAAACACGGGGTAAGCCACTGCCATGCTCTGACCCAAAGGTGGCTGGGGTATTTTCAGTAGCGGGAATGCACG-------------------------------------------------------------------------------------CCTAGAGCCCTGGGCCAGAGCAAGTGAGGGCCCCCCCACCCCCTCCAGTCCTGCCTCCAGCCCTGTTCATG\\n//\\n//\\nContig0:148378-148953;size=2\\nCATGCTGCAGCTAGGGGGAAATGGGGGCAGGGCCGGGGGAACCTCACCCTCCCCAGCTGCGAATCCTGGGA---------CGATCCGACCCGCGGACTGGAGTTCTTTACCCACCCCCTGGCCCTTTAACAACCGGTTCTCCACGGAGGTCTAATT--------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:148378-148953;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTCCTGGTTCACTTCTTCGTCTCTTATAACTTTTGTATTGAGTGCAATACATAGGGCTTGCAAAAATG---------------------------------------------------------------------TCTGGCTGTCCTGTGGAGATTTACAAACCAACCTGCTTCATTGACAGTGCAGTGGGGTTTTCTTCTGTTTCAAATT\\nContig0:148378-148953;size=1\\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTCCTGGTTCACTTCTTCGTCTCTTATAACTTTTGTATTGAGTGCAATACATAGGGCTTGCAAAAATG------------------------------------------------------------------------------------------------------------ATTGACAGTGCAGTGGGGTTTTCTTCTGTTTCAAATT\\n//\\n//\\nContig0:149397-149596;size=2\\nAATTTGAGTTCAGCTGTATTTGCAAAGAGGTGAAACACTCCAAACCAAAATGAAACATTTCGTTTGACAGAACTGT----------------------------------------------------CTCAGCCAACCAACTGAAAAATCTGTTATTTTAACAGCTCTACCCAGGAAGCAAGAGCCAGAGCAGGCATG\\n//\\n//\\nContig0:150178-150481;size=1\\nAATTCTTCGGAAGAGCCTTTTTTTCTAACCTGTGATGCAAGCTGGTCCATTTCAAACACCAGGTATCTCCAAAACAATCCCCACAGGCAGCTACCCCCCTCCAAGATCTCCAAGCACCACTTCAAATAGCGACATG--------------------------------
---------------------------------------------------------------------------------------------------------------------------------------\\nContig0:150178-150481;size=1\\nAATTCTTCGGAAGAGCCTTTTTTTCTAACCTGTGATGCAAGCTGG-----------------------CCAAAACAATCCCCACAGGCAGCTACCCCCCTCCAAGATCTCCAAGCACCACTTCAAATAGCGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:150178-150481;size=1\\n------------------------------------------------------------------------------------------------------------------------------------CATGTGGAATCCTGTGCTATCAATCTGCTCCTCAAAAGAGCCCCCATGAACAAAGCAGAGATGTTCCTAGC------------------------TAGCAAATCCCAGCCTGGCCCTGGGTTCAGACAAGCTCAAACCAGCACAATACAAAGAGAAGAGCAAAGTCTAATT\\n//\\n//\\nContig0:150767-151006;size=17\\nAATTTGATACCATATTCTTCTCTATTTTCTGTCCTCTGTGCACCAAAATCCCACCAAAGCAGGATGTTTGGGGGGT--------------------------------------------------------------------------------------------GTAGATGTGTTTTGGTTGCAGGGAGGCATTCTCTGGGGACATACACATAGGAATGCCTGTATGTCTACATG\\nContig0:150767-151006;size=1\\nAATTTGATACCATATTCTTCTCTATTTTCTGTCCTCTGTGCACCAAAATCCCACCAAAGCAGGATGTTTGGGGGG---------------------------------------------------------------------------------------------GTAGATGTGTTTTGGTTGCAGGGAGGCATTCTCTGGGGACATACACATAGGAATGCCTGTATGTCTACATG\\n//\\n//\\nContig0:151911-152069;size=1\\nAAATTGCATCGAAGAGCACAGAGGCAGGG----------------------------------------------------------TAAACCTACGCTAATGAACACAAATATAGAAGAGAAGGATCACCCAGCAGTGTTAAGGAAGCCGCCGCATG\\n//\\n//\\nContig0:152674-152906;size=7\\nAATTTCCCAGCTTCTGGGGAGTCCTGGAGAGGAGGAAGAATCCGGATCCCTTTGCAGACGGCATTGCTGCTGCACT----------------------------------------------------------------------------------------CTCTGAAGAAGGGCCAGGGGTCAGGGGTGAAAGTATGGGCTTCGTTGTGAGGAAGTGTCCCTGCCATG\\nContig0:152674-152906;size=1\\nAATTTCCCAGCTTCTGGGGAGTCCTGGAGAGGAGGAAGAATCCGGATCCCTTTGCAGACGGCATTGCTGCTGCACT----------------------------------------------
------------------------------------------CTCTGAAGAAGGGCAAGGGGTCAGGGGTGAAAGTATGGGCTTCGTTGTGAGGAAGTGTCCCTGCCATG\\n//\\n//\\nContig0:153152-153312;size=1\\nCATGTACGTCGCACGACTGAATCAATGGGGCTCCTCACCCACTTAAAGGGAATCGTGGGGTTCAGTGCTTT-------------AGGTTTGTTTTGCCACCTCCTGGACTGGCACCTACGAGTCTAAGGCTAAATATTATTTCTAGTAGGGCTGCCAATT\\n//\\n//\\nContig0:153761-154003;size=17\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATGAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGG-------------------------------------------------------------------------------------------------------------------------------------------------TTTGATTGGCTGGAGGGGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA------------------------------------------------------------------------------------------------TTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-154003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATCCTCACCACAGGACCTTCCTCCTGGTGTCTGAT-------------------------------------------------------------------------------------------------------------------------------------------TGATTAGCTGGAGGTGATCTAATCAGCCGGTGTGCTTTAATT\\nContig0:153761-154003;size=1\\nCATAGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\\nContig0:153761-1
54003;size=1\\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA--------------------------------------------------------------------------------------------------TTTAAACCCAGGTGCCCTGATTACCCTGCTTTGATTGGCTGCAGGTGTTCTAATCAGCCAGTCTGCCTTAATT\\n//\\n//\\nContig0:155102-155187;size=1\\nAATTCAGTTCCCCATCACCTGGGGCCAGGACTCTTCTGGGCAGCTTCAGCAGGGACGGGAGGCCAGAGACTGTGTGTGGTCCATG\\n//\\n//\\nContig0:157369-157580;size=1\\nAATTCCTGGCTCTGCTACCGTCTAGCTGTGTGATGGTGCA-----------------------------------------------------------------------------------------------TCCTCTGATGAGAGATGCTATTTCTTAGCTCCTACATATTAGCCAAAGGCTATTTAACCCACTTTCCAGTGTAATT\\n//\\n//\\nContig0:158525-158758;size=7\\nAATTAGATCTTGTGCACAATGAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACAAGCCTCCACCACCA--------------------------------------------------------------------------------------ACCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\\nContig0:158525-158758;size=3\\nAATTAGATCTTGTGCACAATGAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACAAGCCTCCACCACCA---------------------------------------------------------------------------------------CCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\\nContig0:158525-158758;size=1\\nAATTAGATCTTGTGCACAATTAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACA----------------------------------------------------------------------------------------------------ACCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\\n//\\n//\\nContig0:160325-160508;size=1\\nAATTTGCAAGGCAGGGAGCT--CAGTGTCTGCTCCAAAAATCCGCGCTCTCTGTCTCCCCGATGCTCCCTGTCACACT-----------------------------------------------------TAGCTGCCCACAATGCACCACTCCCAACAGCGCTGCAAATGTGGCCACACTT\\n//\\n//\\nContig0:161569-161784;size=1\\nAATTGAAATAGCAAGGAGGGTGCTCAGTGACGGTGGGCATGATATAAAAACCCAAACAGATCGGAAGTCAGTGGCA--------------------------------------------------------------------CCTGCATCAAAAAGAAGAGACTTCCTGAAATAGACAAGTGTCTCAGCAATGAGACAGGCTCAGAGAACATG\\n//\\n//\\nContig0:163234-163571;size=1\\nAATTCTGGCTCATTGAAC
TTCATTTACTCATTGCTGCAATCCCAAGCCTTCAAAAATCATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\\nContig0:163234-163571;size=1\\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGCTGCAGTTTAACTTTTTTGTTTTGAAAGTAAAGCAGTGATTCTCCTGCAATCTTGTGGGGCCTTAAGACAGACATCAAACATCCTGAGATTCGCAATAAGAACTCAATT\\n//\\n//\\nContig0:165831-166037;size=1\\nAATTTCCTGTCCTCCTTCTACCCAAAGCTTCCCCTCCCCACAACGTGCCTCAGGCCCAAAGAATGACCTCGGAAGC-----------------------------------------------------------CCGTGGACTGCCCCAGTGGAGGCATGACAAGATGGACCGTGCGAGTTCCCCCTTTTCTGTGCTGCCCCATG\\nContig0:165831-166037;size=1\\nAATTTCCTGTCCTCCTTCTACCCAAAGCTTCCCCTCCCCACAACGTGCCTCAGGCCCAAAGAATGACCTCGGAAGCATG-------------------------------------------------------------------------------------------------------------------------------\\n//\\n//\\nContig0:169197-169391;size=1\\nAATTAGCAAGGAGGTTTGTCCTCTTGCCCCCCAGGGGCATCTTTGTTGGGGCTGGTTCAGACTGATTGCATAA------------------------------------------------------CCCAGTGACCTCCAAGCACTTTACAAACCTGCCTCAGGGGCCTTACACAGGAGGAAAAGGAGACATG\\nContig0:169197-169391;size=1\\nAATTAGCAAGGAGGTTTGTCCTCTTGCCCCCCAGGGGCATCTTTGTTGGGGCTGGTTCAGACTGATTGCATAAGCT-----------------------------------------------TTCACCCAGTGACCTCCAAGCACTTTACAAACCTGCCTCAGGGGCCTTACACAGGAGGAAAAGGAGACATG'"
]
},
"execution_count": 547,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\\n//\\n//\\n\".join(c).encode()\n"
]
},
{
"cell_type": "code",
"execution_count": 397,
"metadata": {},
"outputs": [],
"source": [
"# lazily parse each tab-delimited 'contig start end' line into a typed tuple\n",
"fields = (line.split(\"\\t\") for line in fullregions)\n",
"regions = ((contig, int(start), int(end)) for contig, start, end in fields)\n",
"build_ref_cigars(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 438,
"metadata": {},
"outputs": [],
"source": [
"# regions inspected while debugging. Only the last assignment was ever live,\n",
"# so keep them in one list instead of a chain of overwritten assignments.\n",
"test_regions = [\n",
"    ('Contig0', 41920, 42479),\n",
"    ('Contig0', 7219468, 7220326),\n",
"    ('Contig0', 207998, 208219),\n",
"    ('Contig0', 16738, 16933),\n",
"    ('Contig0', 49781, 50005),\n",
"    ('Contig0', 76480, 76711),\n",
"    ('Contig0', 24391, 24908),\n",
"    ('Contig0', 7193, 7952),        # has merge\n",
"    ('Contig0', 13327, 13845),      # has indels\n",
"    ('Contig0', 76480, 77015),      # has indels\n",
"    ('Contig0', 346131, 346193),    # valueerror\n",
"]\n",
"reg = test_regions[-1]\n",
"\n",
"# open the sample's sorted BAM, then pull the reference and reads for `reg`\n",
"bampath = os.path.join(\n",
"    data.dirs.refmapping,\n",
"    \"{}-mapped-sorted.bam\".format(sample.name),\n",
")\n",
"samfile = AlignmentFile(bampath, 'rb')\n",
"ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n",
"reads = list(samfile.fetch(*reg))"
]
},
{
"cell_type": "code",
"execution_count": 519,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def cigared(sequence, cigartups):\n",
"    \"\"\"\n",
"    Project a read onto reference coordinates using its CIGAR operations.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    sequence : str\n",
"        Query bases of the read.\n",
"    cigartups : iterable of (int, int) or None\n",
"        (operation, length) pairs using the BAM numeric codes\n",
"        0=M, 1=I, 2=D, 3=N, 4=S, 5=H, 6=P, 7='=', 8=X.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The aligned bases, with deleted or skipped reference positions\n",
"        written as '-' and inserted or soft-clipped bases dropped.\n",
"    \"\"\"\n",
"    start = 0\n",
"    parts = []\n",
"    for flag, add in cigartups or ():\n",
"        # M, =, X consume both query and reference: copy the bases\n",
"        if flag in (0, 7, 8):\n",
"            parts.append(sequence[start:start + add])\n",
"            start += add\n",
"        # I, S consume query only: skip the bases without emitting them\n",
"        elif flag in (1, 4):\n",
"            start += add\n",
"        # D, N consume reference only: pad, leave the query offset alone\n",
"        elif flag in (2, 3):\n",
"            parts.append(\"-\" * add)\n",
"        # H, P consume neither, so there is nothing to do\n",
"    return \"\".join(parts)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 456,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CATGTTCAGTGTAGTATGAAGCGGGGTAGCACTTTCCACTGCAGAAGTAGGTCTCATTGTGTGCGTAGTGC [(4, 15), (0, 25), (1, 3), (0, 16), (1, 1), (0, 5), (1, 1), (0, 5)]\n",
"(4, 15)\n",
"15 15 \n",
"(0, 25)\n",
"40 40 ATGAAGCGGGGTAGCACTTTCCACT\n",
"(1, 3)\n",
"43 43 ATGAAGCGGGGTAGCACTTTCCACT\n",
"(0, 16)\n",
"59 59 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTG\n",
"(1, 1)\n",
"60 60 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTG\n",
"(0, 5)\n",
"65 65 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTGGTGCG\n",
"(1, 1)\n",
"66 66 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTGGTGCG\n",
"(0, 5)\n",
"71 71 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTGGTGCGAGTGC\n",
"ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTGGTGCGAGTGC\n"
]
}
],
"source": [
"# match paired reads together in a dictionary keyed by read name\n",
"rdict = {}\n",
"for read in reads:\n",
"    rdict.setdefault(read.qname, []).append(read)\n",
"\n",
"# sort keys by derep number (the integer after the last '='), largest first\n",
"keys = sorted(\n",
"    rdict,\n",
"    key=lambda x: int(x.split(\"=\")[-1]), reverse=True)\n",
"\n",
"# build the cluster based on map positions, orientation, cigar\n",
"for key in keys:\n",
"    pair = rdict[key]\n",
"    # a read whose mate was not fetched for this region cannot be unpacked\n",
"    # into (r1, r2); skip it instead of raising ValueError\n",
"    if len(pair) != 2:\n",
"        continue\n",
"    r1, r2 = pair\n",
"    print(r1.seq, r1.cigar)\n",
"    print(cigared(r1.seq, r1.cigar))"
]
},
{
"cell_type": "code",
"execution_count": 310,
"metadata": {},
"outputs": [],
"source": [
"# reference bases for the region; the same for every pair, so build it once\n",
"aref = np.array(list(ref[1]))\n",
"\n",
"# build the cluster based on map positions, orientation, cigar\n",
"# (the context manager closes the file even if placing a read fails)\n",
"with open(\"test2.txt\", 'w') as out:\n",
"    for key in keys:\n",
"        pair = rdict[key]\n",
"        # reads whose mate was not fetched for this region cannot be merged\n",
"        if len(pair) != 2:\n",
"            continue\n",
"        r1, r2 = pair\n",
"\n",
"        # gap-filled arrays spanning the reference region\n",
"        arr1 = np.full(aref.size, \"-\", dtype=\"U1\")\n",
"        arr2 = np.full(aref.size, \"-\", dtype=\"U1\")\n",
"\n",
"        # how far ahead of the region start does each read begin\n",
"        start = r1.reference_start - reg[1]\n",
"        seq = cigared(r1.seq, r1.cigar)\n",
"        arr1[start:start + len(seq)] = list(seq)\n",
"\n",
"        start = r2.reference_start - reg[1]\n",
"        seq = cigared(r2.seq, r2.cigar)\n",
"        arr2[start:start + len(seq)] = list(seq)\n",
"\n",
"        arr3 = join_arrays(arr1, arr2)\n",
"        print(\"\".join(arr3), file=out)"
]
},
{
"cell_type": "code",
"execution_count": 307,
"metadata": {},
"outputs": [],
"source": [
"import numba\n",
"\n",
"#numba.jit(nopython=True)\n",
"def join_arrays(arr1, arr2):\n",
"    \"\"\"\n",
"    Merge two equal-shape arrays of single characters into one consensus.\n",
"\n",
"    Per position: identical characters are kept; a called character\n",
"    (anything other than '-' or 'N') wins over '-' or 'N' in the other\n",
"    array; 'N' paired with '-' gives 'N'; two different called characters\n",
"    give 'N'.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    arr1, arr2 : numpy.ndarray of str\n",
"        Arrays of the same shape.\n",
"\n",
"    Returns\n",
"    -------\n",
"    numpy.ndarray\n",
"        New 'U1' array of the same shape; the inputs are not modified.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the two arrays differ in shape.\n",
"    \"\"\"\n",
"    arr1 = np.asarray(arr1, dtype=str)\n",
"    arr2 = np.asarray(arr2, dtype=str)\n",
"    if arr1.shape != arr2.shape:\n",
"        raise ValueError(\n",
"            \"arrays must have the same shape: {} vs {}\".format(\n",
"                arr1.shape, arr2.shape))\n",
"\n",
"    # positions carrying an actual call rather than a gap or an ambiguity\n",
"    called1 = (arr1 != \"-\") & (arr1 != \"N\")\n",
"    called2 = (arr2 != \"-\") & (arr2 != \"N\")\n",
"\n",
"    # the default covers conflicting calls as well as N-versus-gap\n",
"    arr3 = np.full(arr1.shape, \"N\", dtype=\"U1\")\n",
"    only1 = called1 & ~called2\n",
"    only2 = called2 & ~called1\n",
"    arr3[only1] = arr1[only1]\n",
"    arr3[only2] = arr2[only2]\n",
"\n",
"    # agreement is applied last so that '-' paired with '-' stays '-'\n",
"    same = arr1 == arr2\n",
"    arr3[same] = arr1[same]\n",
"    return arr3\n",
" \n",
"\n",
"# worked example: overlapping reads with one conflict, gaps and Ns;\n",
"# `de` is the merge we expect back\n",
"a1 = np.array(list(\"AGAGAG-NN----\"))\n",
"a2 = np.array(list(\"----ACTTNNTTT\"))\n",
"de = np.array(list(\"AGAGANTTNNTTT\"))\n",
"assert (join_arrays(a1, a2) == de).all()\n",
"\n",
"# the same reads padded with trailing gaps in both arrays\n",
"a1 = np.array(list(\"AGAGAG-NN-------\"))\n",
"a2 = np.array(list(\"----ACTTNNTTT---\"))\n",
"de = np.array(list(\"AGAGANTTNNTTT---\"))\n",
"assert (join_arrays(a1, a2) == de).all()"
]
},
{
"cell_type": "code",
"execution_count": 308,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['A', 'G', 'A', 'G', 'A', 'N', 'T', 'T', 'N', 'N', 'T', 'T', 'T',\n",
" '-', '-', '-'], dtype=' r2.blocks[0][0]:\n",
" rx = r2\n",
" r2 = r1\n",
" r1 = rx\n",
"\n",
"# get arrs\n",
"aref = np.array(list(ref[1]))\n",
"arr1 = np.zeros(aref.size, dtype=\"U1\")\n",
"arr2 = np.zeros(aref.size, dtype=\"U1\")\n",
"\n",
"# fill arr1\n",
"seq1 = cigared(r1.seq, r1.cigar)\n",
"arr1[]"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'reference'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mref\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_ref_region\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparamsdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"reference\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mreg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcigar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# which is forward and reverse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'reference'"
]
}
],
"source": [
"# the params key is 'reference_sequence'; 'reference' raises KeyError\n",
"ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n",
"\n",
"def cigar(r1, r2, reg):\n",
"    \"\"\"\n",
"    Lay a read pair out as one gap-padded string spanning a reference region.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    r1, r2 : aligned reads\n",
"        Mates exposing `blocks`, `pos`, `aend`, `seq` and `is_reverse`;\n",
"        they may be passed in either order.\n",
"    reg : tuple\n",
"        (contig, start, end) of the region; only start and end are used.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        '-' padding up to the first read, the first read, '-' padding for\n",
"        the gap between the mates, the second read, then '-' padding up to\n",
"        the region end.\n",
"    \"\"\"\n",
"    # order the mates by the start of their first aligned block\n",
"    if r1.blocks[0][0] > r2.blocks[0][0]:\n",
"        r1, r2 = r2, r1\n",
"\n",
"    # NOTE(review): the mates are reverse-complemented under opposite\n",
"    # conditions (r1 when is_reverse, r2 when not) -- confirm this is intended.\n",
"    first = revcomp(r1.seq) if r1.is_reverse else r1.seq\n",
"    second = r2.seq if r2.is_reverse else revcomp(r2.seq)\n",
"\n",
"    # a negative repeat count yields '', so overlapping mates or reads that\n",
"    # run past a region edge simply get no padding there\n",
"    before = \"-\" * (r1.pos - reg[1])\n",
"    midns = \"-\" * (r2.pos - r1.aend)\n",
"    after = \"-\" * (reg[2] - r2.aend)\n",
"    return before + first + midns + second + after\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 350,
"metadata": {},
"outputs": [],
"source": [
"def get_ref_region(reference, contig, rstart, rend):\n",
"    \"\"\"\n",
"    Fetch one region of the reference with `samtools faidx`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    reference : str\n",
"        Path to the reference fasta.\n",
"    contig : str\n",
"        Sequence name.\n",
"    rstart, rend : int\n",
"        Region bounds; `rstart + 1` and `rend` are what is passed to samtools.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        [header line, sequence with newlines removed].\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If samtools exits non-zero or prints nothing.\n",
"    \"\"\"\n",
"    cmd = [\n",
"        ip.bins.samtools, 'faidx',\n",
"        reference,\n",
"        \"{}:{}-{}\".format(contig, rstart + 1, rend)]\n",
"    proc = sps.Popen(cmd, stdout=sps.PIPE, stderr=sps.PIPE)\n",
"    stdout, err = proc.communicate()\n",
"\n",
"    # surface a samtools failure directly instead of as an unpacking error\n",
"    if proc.returncode or not stdout.strip():\n",
"        raise ValueError(\n",
"            \"samtools faidx failed for {}:\\n{}\".format(cmd[-1], err.decode()))\n",
"\n",
"    name, seq = stdout.decode().split(\"\\n\", 1)\n",
"    return [name, seq.replace(\"\\n\", \"\")]"
]
},
{
"cell_type": "code",
"execution_count": 250,
"metadata": {},
"outputs": [],
"source": [
"ireg = samfile.fetch(*reg)\n",
"\n",
"# group reads by name so that mates end up in the same list\n",
"rdict = {}\n",
"for read in ireg:\n",
"    rdict.setdefault(read.qname, []).append(read)\n",
"\n",
"clust = dict_to_clust(rdict)"
]
},
{
"cell_type": "code",
"execution_count": 251,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['AATTTATTATGAAACAAAAGGCAAAAAACTGTTATGTACATAGTTTAGTCCTATTGAGTGTCTACTCAGCGCTnnnnCGCTCACTGCTCCGCAGTTCAGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG',\n",
" 'AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTGTATTTATTTAAnnnnGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG']"
]
},
"execution_count": 251,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clust"
]
},
{
"cell_type": "code",
"execution_count": 252,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['>Contig0:24392-24679',\n",
" 'AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGCATCTCTTGTCACTGTCCAGCAATAGTCTGCAAGCATTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCCGTGCTCGTCGCTCACTGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG']"
]
},
"execution_count": 252,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get_ref_region takes the reference fasta path ahead of the region fields\n",
"ref = get_ref_region(data.paramsdict[\"reference_sequence\"], *reg)\n",
"ref"
]
},
{
"cell_type": "code",
"execution_count": 255,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGC\n",
"AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTGTATTTATTTAA\n"
]
}
],
"source": [
"print(ref[1][:80])\n",
"print(r1.seq)"
]
},
{
"cell_type": "code",
"execution_count": 272,
"metadata": {},
"outputs": [],
"source": [
"# how far ahead of the region start r1 begins\n",
"ahead = -1 * (reg[1] - (r1.pos - 1))"
]
},
{
"cell_type": "code",
"execution_count": 274,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTG\n",
"---------------------------------------- AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTGTATTTATTTAA\n"
]
}
],
"source": [
"print(ref[1][:90])\n",
"print(\"-\"*ahead, r1.seq)"
]
},
{
"cell_type": "code",
"execution_count": 344,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"r AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTG\n",
"h AATTTATTATGAAACAAAAGGCAAAAAACTGTTATGTACATAGTTTAGTCCTATTGAGTGTCTACTCAGCGCT\n",
"h -----------------------------------------AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTG\n",
"r AATCGCTCGCCGTGCTCGTCGCTCACTGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG\n",
"24679 24608 71\n",
"h -----------------------------------------------------------------------\n",
"71M\n",
"24679 24616 63\n",
"h ---------------------------------------------------------------\n",
"63M\n"
]
}
],
"source": [
"\n",
"# left edge: the reference, then each first mate padded out to its start\n",
"print('r', ref[1][:90])\n",
"for key, (r1, r2) in rdict.items():\n",
"    ahead = r1.pos - reg[1]\n",
"    padded = \"-\" * ahead + r1.seq\n",
"    print('h', padded[:90])\n",
"\n",
"# right edge: the reference, then for each second mate the distance from\n",
"# its start to the region end, followed by its CIGAR string\n",
"print('r', ref[1][-90:])\n",
"for key, (r1, r2) in rdict.items():\n",
"    ahead = reg[2] - r2.pos\n",
"    print(reg[2], r2.pos, ahead)\n",
"    print('h', \"-\" * ahead)\n",
"    print(r2.cigarstring)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAAAGGTnnnnAGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG\n",
"AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGTnnnnGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG\n",
"AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAAAGGnnnnAGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG\n"
]
}
],
"source": [
"for read in clust:\n",
" print(read)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "not enough values to unpack (expected 2, got 1)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mrdict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mqname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0mclust\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict_to_clust\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m\u001b[0m in \u001b[0;36mdict_to_clust\u001b[0;34m(rdict)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mclust\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mpair\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mr1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpair\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mposs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_reference_positions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mr2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_reference_positions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mseedstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseedend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mposs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mposs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)"
]
}
],
"source": [
"# access reads from bam file using pysam\n",
"bampath = os.path.join(\n",
"    data.dirs.refmapping,\n",
"    \"{}-mapped-sorted.bam\".format(sample.name),\n",
")\n",
"samfile = AlignmentFile(bampath, 'rb')\n",
"\n",
"# iterate over all mapped regions, keeping one cluster per region\n",
"clusters = []\n",
"for reg in regions:\n",
"    ireg = samfile.fetch(*reg)\n",
"\n",
"    # match paired reads by read names for all reads in each cluster\n",
"    rdict = {}\n",
"    for read in ireg:\n",
"        rdict.setdefault(read.qname, []).append(read)\n",
"\n",
"    # `clusters` was declared but never filled; store every region's result\n",
"    clust = dict_to_clust(rdict)\n",
"    clusters.append(clust)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'1c64525c1f5f58dc1f713b0e5e1d0941;size=1': []}"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rdict"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"from ipyrad.assemble.util import revcomp"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"def build_ref_clusters_from_cigars(data, sample):\n",
" \n",
" # get regions from bedtools overlaps\n",
" regions = bedtools_merge(data, sample).strip().split(\"\\n\")\n",
" regions = (i.split(\"\\t\") for i in regions)\n",
" regions = ((i, int(j), int(k)) for (i, j, k) in regions)\n",
"\n",
" # access reads from bam file using pysam\n",
" samfile = AlignmentFile(\n",
" os.path.join(data.dirs.refmapping, \n",
" \"{}-mapped-sorted.bam\".format(sample.name)),\n",
" 'rb')\n",
" \n",
" # iterate over all mapped regions\n",
" clusters = []\n",
" for reg in regions:\n",
" ireg = samfile.fetch(*reg)\n",
" \n",
" # match paired reads by read names for all reads in each cluster\n",
" rdict = {}\n",
" for read in ireg:\n",
" if read.qname not in rdict:\n",
" rdict[read.qname] = [read]\n",
" else:\n",
" rdict[read.qname].append(read)\n",
"\n",
" return dict_to_clust(rdict)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG',\n",
" 'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG',\n",
" 'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG',\n",
" 'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG',\n",
" 'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG']"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seq = build_ref_clusters_from_cigars(data, sample)\n",
"seq"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"def dict_to_clust(rdict):\n",
"    \"\"\"\n",
"    Turn {read name: [r1, r2]} into a list of 'nnnn'-joined pair sequences.\n",
"\n",
"    Each value must hold exactly two pysam reads (a ValueError is raised on\n",
"    unpacking otherwise). When r1 is flagged reverse, r2 is written first;\n",
"    otherwise r1 is written first. If the first read's aligned end (aend)\n",
"    lies past the start of the second read's first aligned block, the second\n",
"    read is reverse-complemented with revcomp(); otherwise it is appended\n",
"    unchanged.\n",
"\n",
"    Returns a list with one 'FIRSTnnnnSECOND' string per dictionary entry.\n",
"    \"\"\"\n",
"    clust = []\n",
"    for r1, r2 in rdict.values():\n",
"        # choose which mate leads the concatenated sequence\n",
"        if r1.is_reverse:\n",
"            first, second = r2, r1\n",
"        else:\n",
"            first, second = r1, r2\n",
"\n",
"        # reads overlap when the leading read ends past the trailing read's start\n",
"        if first.aend > second.get_blocks()[0][0]:\n",
"            seq = first.seq + 'nnnn' + revcomp(second.seq)\n",
"        else:\n",
"            # previously the reverse-r1 case joined r2 with itself here\n",
"            seq = first.seq + 'nnnn' + second.seq\n",
"        clust.append(seq)\n",
"    return clust"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"# build clusters for aligning with muscle from the sorted bam file\n",
"samfile = AlignmentFile(\n",
" os.path.join(data.dirs.refmapping, \n",
" \"{}-mapped-sorted.bam\".format(sample.name)),\n",
" 'rb')\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5009\n",
"TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGC\n",
"5009\n",
"TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCCC\n",
"5009\n",
"TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGAACCAGCTGCCCCC\n",
"5009\n",
"TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGCCCCGACCCAGCTGCCCGC\n",
"5009\n",
"TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGC\n",
"5009\n",
"TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG\n",
"5009\n",
"TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATAGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG\n",
"5009\n",
"TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATAGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG\n",
"5009\n",
"TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG\n",
"5009\n",
"TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCTGCTGCAGTTAACTGGCCTCTTAACCCCG\n"
]
}
],
"source": [
"# unpack the first merged region and fetch the reads that overlap it\n",
"chrom, rstart, rend = regions[0].split()\n",
"reg = samfile.fetch(chrom, int(rstart), int(rend))\n",
"\n",
"# for every read show the region start followed by the read sequence\n",
"for read in reg:\n",
"    print(\"{}\\n{}\".format(rstart, read.seq))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "list index out of range",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# file precedence\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mnonm1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0medits1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnonmerged1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mnonm2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0medits2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnonmerged2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIndexError\u001b[0m: list index out of range"
]
}
],
"source": [
"# candidate R1/R2 inputs: non-merged reads in the tmp dir and trimmed reads\n",
"# in the edits dir\n",
"nonmerged1 = os.path.join(\n",
"    data.tmpdir, \n",
"    \"{}_nonmerged_R1_.fastq\".format(sample.name))\n",
"nonmerged2 = os.path.join(\n",
"    data.tmpdir, \n",
"    \"{}_nonmerged_R2_.fastq\".format(sample.name))\n",
"# trimmed files are written as '.fastq.gz' (see sample.concatfiles), not\n",
"# '.fq.gz'; the wrong suffix left both candidate lists empty -> IndexError\n",
"edits1 = os.path.join(\n",
"    data.dirs.edits,\n",
"    \"{}.trimmed_R1_.fastq.gz\".format(sample.name))\n",
"edits2 = os.path.join(\n",
"    data.dirs.edits, \n",
"    \"{}.trimmed_R2_.fastq.gz\".format(sample.name))\n",
"\n",
"# file precedence: the last existing candidate wins, so the non-merged file\n",
"# is preferred over the trimmed one when both exist\n",
"nonm1 = [i for i in (edits1, nonmerged1) if os.path.exists(i)][-1]\n",
"nonm2 = [i for i in (edits2, nonmerged2) if os.path.exists(i)][-1]\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/deren/Documents/ipyrad/tests/4-refpairtest_edits/3L_0.trimmed_R1_.fq.gz'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"edits1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# candidate inputs, listed from lowest to highest precedence\n",
"infiles = [\n",
"    os.path.join(data.dirs.edits, \"{}.trimmed_R1_.fastq.gz\".format(sample.name)),\n",
"    os.path.join(data.dirs.edits, \"{}_R1_concatedit.fq.gz\".format(sample.name)),\n",
"    os.path.join(data.tmpdir, \"{}_merged.fastq\".format(sample.name)),\n",
"    os.path.join(data.tmpdir, \"{}_declone.fastq\".format(sample.name)),\n",
"]\n",
"\n",
"# keep only the files that exist and use the last (highest precedence) one\n",
"infiles = list(filter(os.path.exists, infiles))\n",
"infile = infiles[-1]\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"split_endtoend_reads(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# walk the file two lines at a time (name line, sequence line) and split\n",
"# each sequence on its 'nnnn' spacer; r1 and r2 end up holding the parts of\n",
"# the last record\n",
"with open(inp, 'r') as infile:\n",
"    duo = izip(*[iter(infile)] * 2)\n",
"    idx = 0\n",
"\n",
"    for itera in duo:\n",
"        r1, r2 = itera[1].split(\"nnnn\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def split_endtoend_reads(data, sample):\n",
"    \"\"\"\n",
"    Takes R1nnnnR2 derep reads from paired data and splits it back into \n",
"    separate R1 and R2 parts for read mapping. \n",
"\n",
"    Reads '<data.tmpdir>/<sample.name>_derep.fastq' two lines at a time\n",
"    (name line, sequence line), splits every sequence on the 'nnnn' spacer\n",
"    and writes name + first part to '<sample.name>_R1-tmp.fastq' and\n",
"    name + second part to '<sample.name>_R2-tmp.fastq' in data.tmpdir.\n",
"    A trailing name line without a sequence line is ignored.\n",
"\n",
"    Raises ValueError when a sequence line does not contain exactly one\n",
"    'nnnn' spacer. Returns None.\n",
"    \"\"\"\n",
"    inp = os.path.join(data.tmpdir, \"{}_derep.fastq\".format(sample.name))\n",
"    out1 = os.path.join(data.tmpdir, \"{}_R1-tmp.fastq\".format(sample.name))\n",
"    out2 = os.path.join(data.tmpdir, \"{}_R2-tmp.fastq\".format(sample.name))\n",
"\n",
"    # context managers close all three handles even when a record is\n",
"    # malformed; the input is opened first so a missing input leaves no\n",
"    # empty output files behind\n",
"    with open(inp, 'r') as infile:\n",
"        with open(out1, 'w') as splitderep1, open(out2, 'w') as splitderep2:\n",
"            for name in infile:\n",
"                # the sequence line follows its name line\n",
"                seq = next(infile, None)\n",
"                if seq is None:\n",
"                    break\n",
"\n",
"                # drop the line ending before splitting so both halves get\n",
"                # exactly one newline, even on an unterminated last line\n",
"                part1, part2 = seq.rstrip(\"\\n\").split(\"nnnn\")\n",
"\n",
"                # the name line keeps its own newline; file writes are\n",
"                # buffered so no manual batching is needed\n",
"                splitderep1.write(\"{}{}\\n\".format(name, part1))\n",
"                splitderep2.write(\"{}{}\\n\".format(name, part2))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'mergepairs' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmergepairs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'mergepairs' is not defined"
]
}
],
"source": [
"merge_pairs(data, sample, 1, 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('/home/deren/Documents/ipyrad/tests/4-refpairtest_edits/1A_0.trimmed_R1_.fastq.gz',\n",
" '/home/deren/Documents/ipyrad/tests/4-refpairtest_edits/1A_0.trimmed_R2_.fastq.gz')]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s3.data.samples[\"1A_0\"].concatfiles"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:00 | concatenating | s3 |\n",
"[####################] 100% 0:00:04 | mapping | s3 |\n"
]
}
],
"source": [
"s3.run()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def bedtools_merge(data, sample):\n",
"    \"\"\" \n",
"    Get all contiguous genomic regions with one or more overlapping\n",
"    reads. This is the shell command we'll eventually run\n",
"\n",
"        bedtools bamtobed -i 1A_0.sorted.bam | bedtools merge [-d 100]\n",
"            -i : specifies the input file to bed'ize\n",
"            -d : For PE set max distance between reads\n",
"\n",
"    Returns the decoded output of `bedtools merge` as a single string.\n",
"    Raises IPyradWarningExit when `bedtools merge` exits with a nonzero\n",
"    status. For paired datatypes check_insert_size() is called first, which\n",
"    updates data._hackersonly[\"max_inner_mate_distance\"]. When the debug\n",
"    flag file exists the result is also written to '<sample.name>.bed' in\n",
"    the refmapping directory.\n",
"    \"\"\"\n",
"    LOGGER.info(\"Entering bedtools_merge: %s\", sample.name)\n",
"    mappedreads = os.path.join(\n",
"        data.dirs.refmapping, sample.name + \"-mapped-sorted.bam\")\n",
"\n",
"    ## If PE the -d flag to tell bedtools how far apart to allow mate pairs.\n",
"    ## If SE the -d flag is negative, specifying that SE reads need to\n",
"    ## overlap by at least a specific number of bp. This prevents the\n",
"    ## stairstep syndrome when a + and - read are both extending from\n",
"    ## the same cutsite. Passing a negative number to `merge -d` gets this done.\n",
"    if 'pair' in data.paramsdict[\"datatype\"]:\n",
"        check_insert_size(data, sample)\n",
"        maxdist = data._hackersonly[\"max_inner_mate_distance\"]\n",
"    else:\n",
"        maxdist = -1 * data._hackersonly[\"min_SE_refmap_overlap\"]\n",
"\n",
"    ## command to call `bedtools bamtobed`, and pipe output to stdout\n",
"    ## Usage: bedtools bamtobed [OPTIONS] -i \n",
"    ## Usage: bedtools merge [OPTIONS] -i \n",
"    cmd1 = [ip.bins.bedtools, \"bamtobed\", \"-i\", mappedreads]\n",
"    cmd2 = [ip.bins.bedtools, \"merge\", \"-d\", str(maxdist), \"-i\", \"-\"]\n",
"\n",
"    ## pipe output from bamtobed into merge\n",
"    LOGGER.info(\"stdv: bedtools merge cmds: %s %s\", cmd1, cmd2)\n",
"    proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)\n",
"    ## close our copy of the pipe before waiting, so that proc1 receives\n",
"    ## SIGPIPE if proc2 exits early instead of blocking forever\n",
"    proc1.stdout.close()\n",
"    result = proc2.communicate()[0].decode()\n",
"    ## reap the upstream process so it does not linger as a zombie\n",
"    proc1.wait()\n",
"\n",
"    ## check for errors; the message is formatted here because exceptions,\n",
"    ## unlike logging calls, do not interpolate extra arguments\n",
"    if proc2.returncode:\n",
"        raise IPyradWarningExit(\"error in %s: %s\" % (cmd2, result))\n",
"\n",
"    ## Write the bedfile out, because it's useful sometimes.\n",
"    if os.path.exists(ip.__debugflag__):\n",
"        bedpath = os.path.join(data.dirs.refmapping, sample.name + \".bed\")\n",
"        with open(bedpath, 'w') as outfile:\n",
"            outfile.write(result)\n",
"\n",
"    ## Report the number of regions we're returning (blank lines are not\n",
"    ## regions, so empty output is reported as 0 rather than 1)\n",
"    nregions = len([line for line in result.strip().split(\"\\n\") if line])\n",
"    LOGGER.info(\"bedtools_merge: Got # regions: %s\", nregions)\n",
"    return result"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bedtools_merge"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def check_insert_size(data, sample):\n",
"    \"\"\"\n",
"    check mean insert size for this sample and update \n",
"    hackersonly.max_inner_mate_distance if need be. This value controls how \n",
"    far apart mate pairs can be to still be considered for bedtools merging \n",
"    downstream.\n",
"\n",
"    Runs `samtools stats <sample.name>-mapped-sorted.bam | grep SN` and reads\n",
"    the insert size average, its standard deviation and the average read\n",
"    length from the output. The computed distance is stored in\n",
"    data._hackersonly[\"max_inner_mate_distance\"]; 300 is used when any of the\n",
"    three values is missing or zero. Raises IPyradWarningExit when grep\n",
"    exits with a nonzero status. Returns None.\n",
"    \"\"\"\n",
"    # np.math was only an alias of the stdlib module and is gone from\n",
"    # recent NumPy releases, so use math directly\n",
"    import math\n",
"\n",
"    ## pipe stats output to grep\n",
"    cmd1 = [\n",
"        ip.bins.samtools, \n",
"        \"stats\", \n",
"        os.path.join(\n",
"            data.dirs.refmapping, \"{}-mapped-sorted.bam\".format(sample.name)),\n",
"    ]\n",
"    cmd2 = [\"grep\", \"SN\"]\n",
"    proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)\n",
"    ## close our copy of the pipe so proc1 gets SIGPIPE if grep exits early\n",
"    proc1.stdout.close()\n",
"    res = proc2.communicate()[0].decode()\n",
"    proc1.wait()\n",
"    if proc2.returncode:\n",
"        ## format the message here; exceptions do not interpolate extra args\n",
"        raise IPyradWarningExit(\"error in %s: %s\" % (cmd2, res))\n",
"\n",
"    ## starting vals\n",
"    avg_insert = 0\n",
"    stdv_insert = 0\n",
"    avg_len = 0\n",
"\n",
"    ## iterate over results; the value follows the last ':' on each line\n",
"    for line in res.split(\"\\n\"):\n",
"        if \"insert size average\" in line:\n",
"            avg_insert = float(line.split(\":\")[-1].strip())\n",
"\n",
"        elif \"insert size standard deviation\" in line:\n",
"            ## hack to fix sim data when stdv is 0.0. Shouldn't\n",
"            ## impact real data bcz stdv gets rounded up below\n",
"            stdv_insert = float(line.split(\":\")[-1].strip()) + 0.1\n",
"\n",
"        elif \"average length\" in line:\n",
"            avg_len = float(line.split(\":\")[-1].strip())\n",
"\n",
"    LOGGER.debug(\"avg {} stdv {} avg_len {}\"\n",
"                 .format(avg_insert, stdv_insert, avg_len))\n",
"\n",
"    ## If all values return successfully set the max inner mate distance.\n",
"    ## This is tricky. avg_insert is the average length of R1+R2+inner mate\n",
"    ## distance. avg_len is the average length of a read. If there are lots\n",
"    ## of reads that overlap then avg_insert will be close to but bigger than\n",
"    ## avg_len. We are looking for the right value for `bedtools merge -d`\n",
"    ## which wants to know the max distance between reads. \n",
"    if all([avg_insert, stdv_insert, avg_len]):\n",
"        ## never use a standard deviation smaller than 5\n",
"        if stdv_insert < 5:\n",
"            stdv_insert = 5.\n",
"        spread = 3 * math.ceil(stdv_insert)\n",
"\n",
"        ## If 2 * the average length of a read is less than the average\n",
"        ## insert size then most reads DO NOT overlap\n",
"        if (2 * avg_len) < avg_insert:\n",
"            hack = avg_insert + spread - (2 * avg_len)\n",
"\n",
"        ## If it is > than the average insert size then most reads DO\n",
"        ## overlap, so we have to calculate inner mate distance a little \n",
"        ## differently.\n",
"        else:\n",
"            hack = (avg_insert - avg_len) + spread\n",
"\n",
"        ## set the hackerdict value\n",
"        LOGGER.info(\"stdv: hacked insert size is %s\", hack)\n",
"        data._hackersonly[\"max_inner_mate_distance\"] = int(math.ceil(hack))\n",
"\n",
"    else:\n",
"        ## If something fsck then set a relatively conservative distance\n",
"        data._hackersonly[\"max_inner_mate_distance\"] = 300\n",
"        LOGGER.debug(\"inner mate distance for {} - {}\".format(\n",
"            sample.name, data._hackersonly[\"max_inner_mate_distance\"]))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"#from ipyrad.assemble.refmap import *\n",
"data = s3.data\n",
"samples = list(data.samples.values())\n",
"sample = samples[0]\n",
"regions = bedtools_merge(data, sample).strip().split(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"nregions = len(regions)\n",
"# floor division keeps chunksize an integer on Python 3, where a plain '/'\n",
"# would turn it into a float\n",
"chunksize = (nregions // 10) + (nregions % 10)\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"from pysam import AlignmentFile\n",
"sample.files.mapped_reads = os.path.join(\n",
" data.dirs.refmapping, sample.name + \"-mapped-sorted.bam\")\n",
"samfile = AlignmentFile(sample.files.mapped_reads, 'rb')\n",
"#\"./tortas_refmapping/PZ70-mapped-sorted.bam\", \"rb\")"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'MT\\t10109\\t10200'"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"regions[1]"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] could not open alignment file `ex1.sam`: No such file or directory",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msamfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpysam\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mAlignmentFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"ex1.sam\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32mpysam/libcalignmentfile.pyx\u001b[0m in \u001b[0;36mpysam.libcalignmentfile.AlignmentFile.__cinit__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpysam/libcalignmentfile.pyx\u001b[0m in \u001b[0;36mpysam.libcalignmentfile.AlignmentFile._open\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] could not open alignment file `ex1.sam`: No such file or directory"
]
}
],
"source": [
"samfile = pysam.AlignmentFile(\"ex1.sam\", \"r\")"
]
},
{
"cell_type": "code",
"execution_count": 262,
"metadata": {},
"outputs": [],
"source": [
"a = pysam.AlignmentFile(\"./3-refsetest_refmapping/1A_0.sam\")"
]
},
{
"cell_type": "code",
"execution_count": 287,
"metadata": {},
"outputs": [],
"source": [
"nthreads = 2\n",
"\n",
"# bwa mem call; a falsy second read file is replaced by \"\" and then\n",
"# filtered out of the argument list\n",
"cmd1 = [\n",
"    ip.bins.bwa, \"mem\",\n",
"    \"-t\", str(max(1, nthreads)),\n",
"    \"-M\",\n",
"    data.paramsdict['reference_sequence'], \n",
"    sample.concatfiles[0][0],\n",
"    sample.concatfiles[0][1] or \"\",\n",
"    ] \n",
"cmd1 = [i for i in cmd1 if i]\n",
"\n",
"# Insert optional flags for bwa right after 'mem'; inserting the reversed\n",
"# list at a fixed index keeps the flags in their original order\n",
"bwa_args = data._hackersonly[\"bwa_args\"].split()\n",
"bwa_args.reverse()\n",
"for arg in bwa_args:\n",
"    cmd1.insert(2, arg)\n",
"\n",
"# stdout goes straight to the sam file, so bwa's messages must be captured\n",
"# from stderr; communicate()[0] is always None here\n",
"with open(sample.files.sam, 'wb') as outfile:\n",
"    proc1 = sps.Popen(cmd1, stderr=sps.PIPE, stdout=outfile)\n",
"    error1 = proc1.communicate()[1]\n",
"    if proc1.returncode:\n",
"        raise IPyradError(error1.decode())"
]
},
{
"cell_type": "code",
"execution_count": 288,
"metadata": {},
"outputs": [],
"source": [
"def bwa_map(data, sample, nthreads, force):\n",
"    \"\"\" \n",
"    Map reads to reference sequence. This reads in the read files\n",
"    (sample.concatfiles), and maps each read to the reference. Unmapped reads \n",
"    are dropped right back in the de novo pipeline. \n",
"    Mapped reads end up in a sam file.\n",
"\n",
"    Parameters\n",
"        data: assembly object; paramsdict, dirs and _hackersonly are read.\n",
"        sample: sample object; name and concatfiles are read and the\n",
"            sam, mapped_reads, unmapped_bam and unmapped_reads entries of\n",
"            sample.files are set.\n",
"        nthreads: number of bwa threads (values below 1 are raised to 1).\n",
"        force: accepted for interface compatibility; not used here.\n",
"\n",
"    Raises IPyradError when bwa fails and IPyradWarningExit when the\n",
"    samtools sort, index or bam2fq steps fail. Returns None.\n",
"    \"\"\"\n",
"\n",
"    sample.files.sam = os.path.join(\n",
"        data.dirs.refmapping, \n",
"        \"{}.sam\".format(sample.name))\n",
"\n",
"    sample.files.mapped_reads = os.path.join(\n",
"        data.dirs.refmapping,\n",
"        \"{}-mapped-sorted.bam\".format(sample.name))\n",
"\n",
"    sample.files.unmapped_bam = os.path.join(\n",
"        data.dirs.refmapping,\n",
"        \"{}-unmapped.bam\".format(sample.name))\n",
"\n",
"    sample.files.unmapped_reads = os.path.join(\n",
"        data.dirs.refmapping,\n",
"        \"{}-unmapped.fastq\".format(sample.name))\n",
"\n",
"    # (cmd1) bwa mem [OPTIONS] [] \n",
"    #   -t # : Number of threads\n",
"    #   -M   : Mark split alignments as secondary.\n",
"\n",
"    # (cmd2) samtools view [options] || [region ...] \n",
"    #   -b = write to .bam\n",
"    #   -q = Only keep reads with mapq score >= 30 (seems to be pretty standard)\n",
"    #   -F = Select all reads that DON'T have these flags. \n",
"    #         0x4 (segment unmapped)\n",
"    #         0x100 (Secondary alignment)\n",
"    #         0x800 (supplementary alignment)\n",
"    #   -U = Write out all reads that don't pass the -F filter \n",
"    #        (all unmapped reads go to this file).\n",
"\n",
"    # (cmd3) samtools sort [options...] [in.bam]\n",
"    #   -T = Temporary file name, this is required by samtools, ignore it\n",
"    #        Here we hack it to be samhandle.tmp cuz samtools cleans it up\n",
"    #   -O = Output file format, in this case bam\n",
"    #   -o = Output file name\n",
"\n",
"    # (cmd5) samtools bam2fq -v 45 [in.bam]\n",
"    #   -v45 set the default qscore arbitrarily high\n",
"    #\n",
"    cmd1 = [\n",
"        ip.bins.bwa, \"mem\",\n",
"        \"-t\", str(max(1, nthreads)),\n",
"        \"-M\",\n",
"        data.paramsdict['reference_sequence'], \n",
"        sample.concatfiles[0][0],\n",
"        sample.concatfiles[0][1] or \"\",\n",
"        ] \n",
"    # a falsy second read file became \"\" above and is dropped here\n",
"    cmd1 = [i for i in cmd1 if i]\n",
"\n",
"    # Insert optional flags for bwa right after 'mem', keeping their order\n",
"    cmd1[2:2] = data._hackersonly[\"bwa_args\"].split()\n",
"\n",
"    # stdout is the sam file, so bwa's messages have to be captured from\n",
"    # stderr; communicate()[0] would always be None\n",
"    with open(sample.files.sam, 'wb') as outfile:\n",
"        proc1 = sps.Popen(cmd1, stderr=sps.PIPE, stdout=outfile)\n",
"        error1 = proc1.communicate()[1]\n",
"    if proc1.returncode:\n",
"        raise IPyradError(error1.decode())\n",
"\n",
"    # sends unmapped reads to a files and will PIPE mapped reads to cmd3\n",
"    cmd2 = [\n",
"        ip.bins.samtools, \"view\", \n",
"        \"-b\", \n",
"        \"-F\", \"0x904\",\n",
"        \"-U\", sample.files.unmapped_bam,\n",
"        sample.files.sam, \n",
"    ]\n",
"\n",
"    # this is gonna catch mapped bam output from cmd2 and write to file\n",
"    cmd3 = [\n",
"        ip.bins.samtools, \"sort\", \n",
"        \"-T\", os.path.join(data.dirs.refmapping, sample.name + \".sam.tmp\"),\n",
"        \"-O\", \"bam\", \n",
"        \"-o\", sample.files.mapped_reads]\n",
"\n",
"    # Later we're gonna use samtools to grab out regions using 'view'\n",
"    # (uses the `ip` alias like every other command in this function)\n",
"    cmd4 = [ip.bins.samtools, \"index\", sample.files.mapped_reads]\n",
"\n",
"    # convert unmapped reads to fastq; flag and value are separate argv items\n",
"    cmd5 = [\n",
"        ip.bins.samtools, \"bam2fq\",\n",
"        \"-v\", \"45\",\n",
"        sample.files.unmapped_bam, \n",
"    ]\n",
"\n",
"    # Insert additional arguments for paired data to the commands.\n",
"    # We assume Illumina paired end reads for the orientation \n",
"    # of mate pairs (orientation: ---> <----). \n",
"    if 'pair' in data.paramsdict[\"datatype\"]:\n",
"        # add samtools filter for only keep if both pairs hit\n",
"        # 0x1 - Read is paired\n",
"        # 0x2 - Each read properly aligned\n",
"        cmd2[2:2] = [\"-f\", \"0x3\"]\n",
"\n",
"        # tell bam2fq that there are output files for each read pair\n",
"        umap1 = os.path.join(\n",
"            data.dirs.edits, sample.name + \"-tmp-umap1.fastq\")\n",
"        umap2 = os.path.join(\n",
"            data.dirs.edits, sample.name + \"-tmp-umap2.fastq\")\n",
"        cmd5[2:2] = [\"-2\", umap2, \"-1\", umap1]\n",
"    else:\n",
"        cmd5[2:2] = [\"-0\", sample.files.unmapped_reads]\n",
"\n",
"    # cmd2 writes to sname.unmapped.bam and fills pipe with mapped BAM data\n",
"    LOGGER.debug(\" \".join(cmd2))\n",
"    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"\n",
"    # cmd3 pulls mapped BAM from pipe and writes to sname.mapped-sorted.bam\n",
"    LOGGER.debug(\" \".join(cmd3))\n",
"    proc3 = sps.Popen(cmd3, \n",
"        stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)\n",
"    # close our copy of the pipe before waiting so that proc2 receives\n",
"    # SIGPIPE if the sort exits early, then reap proc2 afterwards\n",
"    proc2.stdout.close()\n",
"    error3 = proc3.communicate()[0]\n",
"    proc2.wait()\n",
"    if proc3.returncode:\n",
"        raise IPyradWarningExit(error3)\n",
"\n",
"    # cmd4 indexes the bam file \n",
"    LOGGER.debug(\" \".join(cmd4))\n",
"    proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"    error4 = proc4.communicate()[0]\n",
"    if proc4.returncode:\n",
"        raise IPyradWarningExit(error4)\n",
"\n",
"    # Running cmd5 writes to either edits/sname-refmap_derep.fastq for SE\n",
"    # or it makes edits/sname-tmp-umap{12}.fastq for paired data, which \n",
"    # will then need to be merged.\n",
"    LOGGER.debug(\" \".join(cmd5))\n",
"    proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"    error5 = proc5.communicate()[0]\n",
"    if proc5.returncode:\n",
"        raise IPyradWarningExit(error5)"
]
},
{
"cell_type": "code",
"execution_count": 289,
"metadata": {},
"outputs": [],
"source": [
"bwa_map(data, sample, 2, 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n"
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(None, None)"
]
},
"execution_count": 205,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cmd1 = [\n",
"    ip.bins.bwa, \"mem\",\n",
"    \"-t\", str(max(1, 2)),\n",
"    \"-M\",\n",
"    data.paramsdict['reference_sequence'], \n",
"    sample.concatfiles[0][0],\n",
"    sample.concatfiles[0][1] or \"\",\n",
"    ] \n",
"# drop the empty placeholder that stands in for a missing second read\n",
"# file, so bwa is not handed an empty-string argument\n",
"cmd1 = [i for i in cmd1 if i]\n",
"\n",
"proc1 = sps.Popen(cmd1)#, stderr=cmd1_stderr, stdout=cmd1_stdout)\n",
"proc1.communicate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pysam.sort(\"-m\", \"1000000\", \"-o\", \"output.bam\", \"ex1.bam\")"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [],
"source": [
"iterreg = samfile.fetch(\"MT\", 5009, 5100)\n",
"iterreg = samfile.fetch(\"MT\", 10109, 10200)\n",
"iterreg = samfile.fetch(\"MT\", 285510, 285600)"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TGCAGACCATAATGGCATGGTGCCACAGCTAGGGACCGATCTCTTTCTAAACTACATCCCCTAGGTTGAGACCCATGCGTGCTCTAATTGG\n"
]
},
{
"ename": "TypeError",
"evalue": "Argument 'other' has incorrect type (expected pysam.libcalignedsegment.AlignedSegment, got str)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mread\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterreg\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_reference_sequence\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: Argument 'other' has incorrect type (expected pysam.libcalignedsegment.AlignedSegment, got str)"
]
}
],
"source": [
"# show each read above the reference sequence it aligned to\n",
"for read in iterreg:\n",
"    print(read.seq)\n",
"    # compare() only accepts another AlignedSegment (passing a str raised\n",
"    # TypeError), so print the reference sequence itself for comparison\n",
"    print(read.get_reference_sequence())"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pileup iterator over the first region; the dangling 'pysam.Pileup.'\n",
"# completion stub was a syntax error, so display the iterator instead\n",
"it = samfile.pileup(\"MT\", 5009, 5100)\n",
"it"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n",
"[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]\n"
]
}
],
"source": [
"for read in iterreg:\n",
" print(read.aligned_pairs)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"rdict = {}\n",
"for read in iterreg:\n",
" if read.qname not in rdict:\n",
" rdict[read.qname] = read"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCCC'"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"read.seq\n"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0, 91)]"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"read.cigar"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['lane1_locus0_1B_0_0', 'lane1_locus0_1B_0_1', 'lane1_locus0_1B_0_2', 'lane1_locus0_1B_0_3', 'lane1_locus0_1B_0_4', 'lane1_locus0_1B_0_5', 'lane1_locus0_1B_0_6', 'lane1_locus0_1B_0_7', 'lane1_locus0_1B_0_8', 'lane1_locus0_1B_0_9', 'lane1_locus0_1B_0_10', 'lane1_locus0_1B_0_11', 'lane1_locus0_1B_0_12', 'lane1_locus0_1B_0_13', 'lane1_locus0_1B_0_14', 'lane1_locus0_1B_0_15', 'lane1_locus0_1B_0_16', 'lane1_locus0_1B_0_17', 'lane1_locus0_1B_0_18', 'lane1_locus0_1B_0_19', 'lane1_locus0_1B_0_20'])"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(\n",
" rdict.keys(),\n",
" key=lambda x: int(x.split(\"\"))"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"it = samfile.pileup('MT', 5009, 5100)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "list index out of range",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mchrom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mregion\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mclust\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfetch_cluster_se\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msamfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchrom\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpos1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpos2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/refmap.py\u001b[0m in \u001b[0;36mfetch_cluster_se\u001b[0;34m(data, samfile, chrom, rstart, rend)\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[0;31m## sort dict keys so highest derep is first ('seed')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[0msfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";size=\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 370\u001b[0;31m \u001b[0mrkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreverse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 371\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 372\u001b[0m \u001b[0;31m## get blocks from the seed for filtering, bail out if seed is not paired\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/refmap.py\u001b[0m in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[0;31m## sort dict keys so highest derep is first ('seed')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 369\u001b[0;31m \u001b[0msfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";size=\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 370\u001b[0m \u001b[0mrkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreverse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIndexError\u001b[0m: list index out of range"
]
}
],
"source": [
"clusts = []\n",
"nclusts = 0\n",
"for region in regions:\n",
" chrom, pos1, pos2 = region.split()\n",
" \n",
" clust = fetch_cluster_se(data, samfile, chrom, int(pos1), int(pos2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"def fetch_cluster_se(data, samfile, chrom, rstart, rend):\n",
" \"\"\"\n",
" Builds a single end cluster from the refmapped data.\n",
" \"\"\"\n",
"\n",
" ## If SE then we enforce the minimum overlap distance to avoid the\n",
" ## staircase syndrome of multiple reads overlapping just a little.\n",
" overlap_buffer = data._hackersonly[\"min_SE_refmap_overlap\"]\n",
"\n",
" ## the *_buff variables here are because we have to play patty\n",
" ## cake here with the rstart/rend vals because we want pysam to\n",
" ## enforce the buffer for SE, but we want the reference sequence\n",
" ## start and end positions to print correctly for downstream.\n",
" rstart_buff = rstart + overlap_buffer\n",
" rend_buff = rend - overlap_buffer\n",
"\n",
" ## Reads that map to only very short segements of the reference\n",
" ## sequence will return buffer end values that are before the\n",
" ## start values causing pysam to complain. Very short mappings.\n",
" if rstart_buff > rend_buff:\n",
" tmp = rstart_buff\n",
" rstart_buff = rend_buff\n",
" rend_buff = tmp\n",
" ## Buffering can't make start and end equal or pysam returns nothing.\n",
" if rstart_buff == rend_buff:\n",
" rend_buff += 1\n",
"\n",
" ## store pairs\n",
" rdict = {}\n",
" clust = []\n",
" iterreg = []\n",
"\n",
" iterreg = samfile.fetch(chrom, rstart_buff, rend_buff)\n",
"\n",
" ## use dict to match up read pairs\n",
" for read in iterreg:\n",
" if read.qname not in rdict:\n",
" rdict[read.qname] = read\n",
"\n",
" ## sort dict keys so highest derep is first ('seed')\n",
" sfunc = lambda x: int(x.split(\";size=\")[1].split(\";\")[0])\n",
" rkeys = sorted(rdict.keys(), key=sfunc, reverse=True)\n",
"\n",
" ## get blocks from the seed for filtering, bail out if seed is not paired\n",
" try:\n",
" read1 = rdict[rkeys[0]]\n",
" except ValueError:\n",
" LOGGER.error(\"Found bad cluster, skipping - key:{} rdict:{}\".format(rkeys[0], rdict))\n",
" return \"\"\n",
"\n",
" ## the starting blocks for the seed\n",
" poss = read1.get_reference_positions(full_length=True)\n",
" seed_r1start = min(poss)\n",
" seed_r1end = max(poss)\n",
"\n",
" ## store the seed -------------------------------------------\n",
" if read1.is_reverse:\n",
" seq = revcomp(read1.seq)\n",
" else:\n",
" seq = read1.seq\n",
"\n",
" ## store, could write orient but just + for now.\n",
" size = sfunc(rkeys[0])\n",
" clust.append(\">{}:{}:{};size={};*\\n{}\"\\\n",
" .format(chrom, seed_r1start, seed_r1end, size, seq))\n",
"\n",
" ## If there's only one hit in this region then rkeys will only have\n",
" ## one element and the call to `rkeys[1:]` will raise. Test for this.\n",
" if len(rkeys) > 1:\n",
" ## store the hits to the seed -------------------------------\n",
" for key in rkeys[1:]:\n",
" skip = False\n",
" try:\n",
" read1 = rdict[key]\n",
" except ValueError:\n",
" ## enter values that will make this read get skipped\n",
" read1 = rdict[key][0]\n",
" skip = True\n",
"\n",
" ## orient reads only if not skipping\n",
" if not skip:\n",
" poss = read1.get_reference_positions(full_length=True)\n",
" minpos = min(poss)\n",
" maxpos = max(poss)\n",
" ## store the seq\n",
" if read1.is_reverse:\n",
" seq = revcomp(read1.seq)\n",
" else:\n",
" seq = read1.seq\n",
" ## store, could write orient but just + for now.\n",
" size = sfunc(key)\n",
" clust.append(\">{}:{}:{};size={};+\\n{}\"\\\n",
" .format(chrom, minpos, maxpos, size, seq))\n",
" else:\n",
" ## seq is excluded, though, we could save it and return\n",
" ## it as a separate cluster that will be aligned separately.\n",
" pass\n",
"\n",
" return clust "
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"cmd1 = [\n",
" ip.bins.bwa, \"mem\",\n",
" \"-t\", str(max(1, nthreads)),\n",
" \"-M\",\n",
" data.paramsdict['reference_sequence'], \n",
" ] \n",
"cmd1 += [i for i in sample.concatfiles[0] if i]\n",
"\n",
"# Insert optional flags for bwa\n",
"bwa_args = data._hackersonly[\"bwa_args\"].split()\n",
"bwa_args.reverse()\n",
"for arg in bwa_args:\n",
" cmd1.insert(2, arg)\n",
"\n",
"cmd1_stdout_handle = os.path.join(\n",
" data.dirs.refmapping, sample.name + \".sam\")\n",
"cmd1_stdout = open(cmd1_stdout_handle, 'w')\n",
"cmd1_stderr = None\n",
"\n",
"proc1 = sps.Popen(cmd1, stderr=cmd1_stderr, stdout=cmd1_stdout)\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"error1 = proc1.communicate()[0]"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"cmd2 = [\n",
" ip.bins.samtools, \"view\", \n",
" \"-b\", \n",
" #\"-q\", \"30\",\n",
" \"-F\", \"0x904\",\n",
" \"-U\", os.path.join(\n",
" data.dirs.refmapping, sample.name + \"-unmapped.bam\"), \n",
" os.path.join(data.dirs.refmapping, sample.name + \".sam\")]\n"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"#proc2.communicate()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(b'', None)"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"proc3 = sps.Popen(cmd3, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)\n",
"proc3.communicate()"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(b'', None)"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"proc4.communicate()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
" sample.files.unmapped_reads = os.path.join(\n",
" data.dirs.refmapping,\n",
" \"{}-unmapped.fastq\".format(sample.name))\n",
"\n",
" \n",
" cmd4 = [ip.bins.samtools, \"index\", sample.files.mapped_reads]\n",
"\n",
" # this is gonna read in the unmapped files, args are added below, \n",
" # and it will output fastq formatted unmapped reads for merging.\n",
" # -v 45 sets the default qscore arbitrarily high\n",
" cmd5 = [\n",
" ip.bins.samtools, \"bam2fq\",\n",
" \"-v 45\",\n",
" os.path.join(data.dirs.refmapping, sample.name + \"-unmapped.bam\")]\n",
"\n",
" # Insert additional arguments for paired data to the commands.\n",
" # We assume Illumina paired end reads for the orientation \n",
" # of mate pairs (orientation: ---> <----). \n",
" if 'pair' in data.paramsdict[\"datatype\"]:\n",
" # add samtools filter for only keep if both pairs hit\n",
" # 0x1 - Read is paired\n",
" # 0x2 - Each read properly aligned\n",
" cmd2.insert(2, \"0x3\")\n",
" cmd2.insert(2, \"-f\")\n",
"\n",
" # tell bam2fq that there are output files for each read pair\n",
" cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + \"-tmp-umap1.fastq\"))\n",
" cmd5.insert(2, \"-1\")\n",
" cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + \"-tmp-umap2.fastq\"))\n",
" cmd5.insert(2, \"-2\")\n",
" else:\n",
" cmd5.insert(2, sample.files.unmapped_reads)\n",
" cmd5.insert(2, \"-0\")"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/home/deren/Documents/ipyrad/bin/samtools-linux-x86_64',\n",
" 'bam2fq',\n",
" '-0',\n",
" '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-unmapped.fastq',\n",
" '-v 45',\n",
" '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-unmapped.bam']"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cmd5"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)\n",
"error5 = proc5.communicate()[0]"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/home/deren/Documents/ipyrad/bin/samtools-linux-x86_64',\n",
" 'sort',\n",
" '-T',\n",
" '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0.sam.tmp',\n",
" '-O',\n",
" 'bam',\n",
" '-o',\n",
" '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-mapped-sorted.bam']"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample.files.mapped_reads = os.path.join(\n",
" data.dirs.refmapping,\n",
" \"{}-mapped-sorted.bam\".format(sample.name))\n",
"\n",
"# this is gonna catch mapped bam output from cmd2 and write to file\n",
"cmd3 = [\n",
" ip.bins.samtools, \"sort\", \n",
" \"-T\", os.path.join(data.dirs.refmapping, sample.name + \".sam.tmp\"),\n",
" \"-O\", \"bam\", \n",
" \"-o\", sample.files.mapped_reads]\n",
"cmd3"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.dirs.refmapping"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/home/deren/Documents/ipyrad/bin/bwa-linux-x86_64',\n",
" 'mem',\n",
" '-t',\n",
" '2',\n",
" '-M',\n",
" '/home/deren/Documents/ipyrad/tests/ipsimdata/rad_example_genome.fa']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nthreads = 2\n",
"\n",
"\n",
"cmd1 = [\n",
" ip.bins.bwa, \"mem\",\n",
" \"-t\", str(max(1, nthreads)),\n",
" \"-M\",\n",
" data.paramsdict['reference_sequence']\n",
" ] \n",
"\n",
"cmd1\n",
"#cmd1 += sample.files.dereps\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:01 | dereplicating | s3 |\n",
"[####################] 100% 0:00:01 | clustering/mapping | s3 |\n",
"[####################] 100% 0:00:00 | building clusters | s3 |\n",
"[####################] 100% 0:00:00 | chunking clusters | s3 |\n",
"[####################] 100% 0:00:15 | aligning clusters | s3 |\n",
"[####################] 100% 0:00:00 | concat clusters | s3 |\n"
]
}
],
"source": [
"s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)\n",
"sample = s3.samples[0]\n",
"s3.run()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"#derep_sort_map(s3.data, sample, s3.force, s3.nthreads)\n",
"sample.concatfiles = concat_multiple_edits(s3.data, sample)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('/home/deren/Documents/ipyrad/tests/2-setest_edits/3K_0.trimmed_R1_.fastq.gz',\n",
" 0)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample.mergedfile = merge_pairs(s3.data, sample, 1, 1) \n",
"sample.mergedfile"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "expected str, bytes or os.PathLike object, not list",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnew_derep_and_sort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmergedfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtmpdir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m+\u001b[0m \u001b[0;34m\"_derep.fastq\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnthreads\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mnew_derep_and_sort\u001b[0;34m(data, infile, outfile, nthreads)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0;31m## build PIPEd job\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m \u001b[0mproc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msps\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcmd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msps\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSTDOUT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstdout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msps\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPIPE\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclose_fds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 430\u001b[0m \u001b[0merrmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mproc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/lib/python3.6/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)\u001b[0m\n\u001b[1;32m 707\u001b[0m \u001b[0mc2pread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc2pwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 708\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0;31m# Cleanup if the child failed starting.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/lib/python3.6/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1273\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1274\u001b[0m \u001b[0merrpipe_read\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrpipe_write\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1275\u001b[0;31m restore_signals, start_new_session, preexec_fn)\n\u001b[0m\u001b[1;32m 1276\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_child_created\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1277\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: expected str, bytes or os.PathLike object, not list"
]
}
],
"source": [
"# Dereplicate sample.mergedfile into <tmpdir>/<sample name>_derep.fastq.\n",
"# NOTE(review): the preceding cell displayed sample.mergedfile as a list, which\n",
"# cannot be used as a path in a subprocess command -- confirm what merge_pairs returns.\n",
"new_derep_and_sort(s3.data, sample.mergedfile, os.path.join(s3.data.tmpdir, sample.name+ \"_derep.fastq\"), s3.nthreads)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hand-built copy of the vsearch dereplication command for one sample.\n",
"# outfile and nthreads are bound here so the cell runs on its own; they take\n",
"# the same values this notebook passes to new_derep_and_sort().\n",
"outfile = os.path.join(s3.data.tmpdir, sample.name + \"_derep.fastq\")\n",
"nthreads = s3.nthreads\n",
"\n",
"# gbs and 2brad datatypes are dereplicated on both strands. A membership\n",
"# test is required: ('gbs' or '2brad') evaluates to just 'gbs', and `is`\n",
"# compares identity rather than equality.\n",
"strand = \"plus\"\n",
"if s3.data.paramsdict[\"datatype\"] in (\"gbs\", \"2brad\"):\n",
"    strand = \"both\"\n",
"\n",
"cmd = [\n",
"    ip.bins.vsearch,\n",
"    \"--derep_fulllength\", sample.mergedfile,\n",
"    \"--strand\", strand,\n",
"    \"--output\", outfile,\n",
"    \"--threads\", str(nthreads),\n",
"    \"--fasta_width\", str(0),\n",
"    \"--fastq_qmax\", \"1000\",\n",
"    \"--sizeout\",\n",
"    \"--relabel_md5\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Assembly: pairtest\n",
"[####################] 100% 0:00:00 | inferring [H, E] | s4 |\n",
"\n",
" Encountered an unexpected error (see ./ipyrad_log.txt)\n",
" Error message is below -------------------------------\n",
"The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()\n"
]
}
],
"source": [
"# Run assembly step 4 on the Assembly object carried by the step-3 driver.\n",
"s3.data.run(\"4\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:01 | dereplicating | s3 |\n"
]
}
],
"source": [
"# Run only the dereplication stage of step 3; the cluster/map/build stage is\n",
"# left commented out so the following cells can call its pieces by hand.\n",
"s3.remote_run_dereps()\n",
"#s3.remote_run_cluster_map_build()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Call cluster() directly, one sample at a time, with the driver's settings.\n",
"# The loop leaves `sample` bound to the last element of s3.samples.\n",
"for sample in s3.samples:\n",
" cluster(s3.data, sample, s3.nthreads, s3.force)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "int() argument must be a string, a bytes-like object or a number, not 'list'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msample\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mbuild_clusters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaxindels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mbuild_clusters\u001b[0;34m(data, sample, maxindels)\u001b[0m\n\u001b[1;32m 872\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 873\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";size=\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 874\u001b[0;31m reverse=True) \n\u001b[0m\u001b[1;32m 875\u001b[0m \u001b[0mseqlist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfseqs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[0mseqsize\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 871\u001b[0m fseqs = [fseqs[0]] + sorted(fseqs[1:], \n\u001b[1;32m 872\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 873\u001b[0;31m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";size=\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 874\u001b[0m reverse=True) \n\u001b[1;32m 875\u001b[0m \u001b[0mseqlist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfseqs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: int() argument must be a string, a bytes-like object or a number, not 'list'"
]
}
],
"source": [
"# Call build_clusters() for every sample, passing the driver's indel limit.\n",
"for sample in s3.samples:\n",
" build_clusters(s3.data, sample, s3.maxindels)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Call muscle_chunker() for every sample.\n",
"# NOTE(review): presumably writes the <sample>_chunk_<n>.ali files that the\n",
"# alignment cells read from s3.data.tmpdir -- confirm.\n",
"for sample in s3.samples:\n",
" muscle_chunker(s3.data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# Align every chunk of every sample in this kernel. The value returned by\n",
"# each align_and_parse call is kept, keyed by sample name, instead of being\n",
"# thrown away (the per-sample lists used to stay empty).\n",
"# NOTE(review): 10 chunks per sample is hard-coded -- presumably the number\n",
"# muscle_chunker writes; confirm.\n",
"aasyncs = {}\n",
"for sample in s3.samples:\n",
"    aasyncs[sample.name] = []\n",
"    for idx in range(10):\n",
"        handle = os.path.join(\n",
"            s3.data.tmpdir, \"{}_chunk_{}.ali\".format(sample.name, idx))\n",
"        aasyncs[sample.name].append(\n",
"            align_and_parse(handle, s3.maxindels, s3.gbs))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def _get_derep_num(read):\n",
"    \"\"\"\n",
"    Return the number of replicates recorded in a dereplicated read name.\n",
"\n",
"    The count is the run of digits right after the last '=' in `read`\n",
"    (the 'size=N' tag). Whatever follows the digits is ignored, so the\n",
"    'size=N*', 'size=N;*' and bare 'size=N' name layouts are all accepted.\n",
"\n",
"    Raises ValueError when no digit follows the last '='.\n",
"    \"\"\"\n",
"    tail = read.split(\"=\")[-1]\n",
"    # length of the leading run of digits in `tail`\n",
"    ndigits = len(tail) - len(tail.lstrip(\"0123456789\"))\n",
"    return int(tail[:ndigits])\n",
"\n",
"\n",
"\n",
"\n",
"def _aligned_indel_filter(clust, max_internal_indels):\n",
"    \"\"\"\n",
"    Report whether an aligned cluster carries too many internal indels.\n",
"\n",
"    `clust` is whitespace-split into alternating name/sequence tokens. When\n",
"    every sequence contains the 'nnnn' pair spacer, the part before and the\n",
"    part after the spacer are scored separately; otherwise each sequence is\n",
"    scored whole. Gap runs at either end of a scored piece are not counted.\n",
"\n",
"    Returns 1 if any piece has more than `max_internal_indels` internal '-'\n",
"    characters, else 0.\n",
"    \"\"\"\n",
"    reads = clust.split()[1::2]\n",
"    halves = [read.split(\"nnnn\") for read in reads]\n",
"\n",
"    # paired data only when *every* read splits at the spacer\n",
"    if all(len(parts) > 1 for parts in halves):\n",
"        pieces = [parts[0] for parts in halves] + [parts[1] for parts in halves]\n",
"    else:\n",
"        pieces = reads\n",
"\n",
"    # strip(\"-\") removes the edge gap runs, leaving only internal gaps\n",
"    worst = max(piece.strip(\"-\").count(\"-\") for piece in pieces)\n",
"    return 1 if worst > max_internal_indels else 0\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Call reconcat() for every sample.\n",
"# NOTE(review): presumably stitches the aligned chunk files back into one\n",
"# file per sample -- confirm.\n",
"for sample in s3.samples:\n",
" reconcat(s3.data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def align_and_parse(handle, max_internal_indels=5, is_gbs=False):\n",
"    \"\"\"\n",
"    Align the clusters stored in one chunk file and write the survivors.\n",
"\n",
"    The chunk at `handle` is split on the '//' separator pair, aligned with\n",
"    _persistent_popen_align3 (at most 200 sequences per cluster), and every\n",
"    alignment rejected by _aligned_indel_filter is dropped. Kept alignments\n",
"    go to a sibling file whose extension is replaced by '.aligned'. The\n",
"    input chunk is deleted afterwards unless LOGGER is at level 10.\n",
"\n",
"    Returns the number of clusters dropped for having too many internal\n",
"    indels, or 0 when the chunk is unreadable or empty.\n",
"    \"\"\"\n",
"    ## an unreadable chunk is handled exactly like an empty one\n",
"    try:\n",
"        with open(handle, 'rb') as infile:\n",
"            text = infile.read().decode()\n",
"    except IOError:\n",
"        text = \"\"\n",
"\n",
"    ## split on the separator pair and discard the empty pieces\n",
"    clusts = [block for block in text.split(\"//\\n//\\n\") if block]\n",
"    if not clusts:\n",
"        LOGGER.debug(\"skipping empty chunk - {}\".format(handle))\n",
"        return 0\n",
"\n",
"    ## send the clusters to muscle, which splits and aligns pairs\n",
"    aligned = _persistent_popen_align3(clusts, 200, is_gbs)\n",
"\n",
"    ## keep alignments that pass the indel filter; the rest are only counted\n",
"    refined = [\n",
"        clust for clust in aligned\n",
"        if not _aligned_indel_filter(clust, max_internal_indels)\n",
"    ]\n",
"    highindels = len(aligned) - len(refined)\n",
"\n",
"    ## nothing is written when every cluster was filtered out\n",
"    if refined:\n",
"        outhandle = handle.rsplit(\".\", 1)[0] + \".aligned\"\n",
"        payload = \"\\n//\\n//\\n\".join(refined) + \"\\n\"\n",
"        with open(outhandle, 'wb') as outfile:\n",
"            outfile.write(payload.encode())\n",
"\n",
"    ## remove the old tmp file (kept when the logger is at level 10)\n",
"    if LOGGER.getEffectiveLevel() != 10:\n",
"        os.remove(handle)\n",
"    return highindels"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def _persistent_popen_align3(clusts, maxseqs=200, is_gbs=False):\n",
"    \"\"\"\n",
"    Align clusters with muscle through one persistent bash shell.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    clusts : iterable of str\n",
"        Cluster blocks made of alternating name and sequence tokens. A\n",
"        cluster is treated as paired when every sequence splits at the\n",
"        'nnnn' spacer; read1 and read2 are then aligned separately.\n",
"    maxseqs : int\n",
"        Only the first `maxseqs` sequences of a cluster are aligned.\n",
"    is_gbs : bool\n",
"        When True, unpaired alignments are passed through _gbs_trim.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        One aligned cluster per entry, seed read first and the others by\n",
"        decreasing derep size. Paired clusters that raise ValueError while\n",
"        being parsed are logged and left out.\n",
"\n",
"    Raises\n",
"    ------\n",
"    IOError\n",
"        If the shell's stdout reaches EOF before an alignment finishes.\n",
"    \"\"\"\n",
"    ## create a separate shell for running muscle in, this is much faster\n",
"    ## than spawning a separate subprocess for each muscle call\n",
"    proc = sps.Popen(\n",
"        [\"bash\"],\n",
"        stdin=sps.PIPE,\n",
"        stdout=sps.PIPE,\n",
"        bufsize=0,\n",
"    )\n",
"\n",
"    def _muscle(fasta):\n",
"        \"pipe one fasta block through muscle and return its output lines\"\n",
"        # '//' is echoed after muscle exits to mark the end of the output\n",
"        cmd = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n",
"               .format(fasta, ip.bins.muscle, \"//\\n\"))\n",
"\n",
"        ## send cmd to the bash shell (TODO: PIPE could overflow here!)\n",
"        proc.stdin.write(cmd.encode())\n",
"\n",
"        ## read the stdout by line until // is reached. This BLOCKS.\n",
"        out = []\n",
"        for line in iter(proc.stdout.readline, b'//\\n'):\n",
"            # readline() keeps returning b'' once the shell is gone, which\n",
"            # would otherwise spin here forever\n",
"            if not line:\n",
"                raise IOError(\"muscle shell closed before '//' was read\")\n",
"            out.append(line.decode())\n",
"        return out\n",
"\n",
"    ## iterate over clusters in this file until finished\n",
"    aligned = []\n",
"    try:\n",
"        for clust in clusts:\n",
"\n",
"            ## don't bother aligning if only one seq\n",
"            if clust.count(\">\") == 1:\n",
"                aligned.append(clust.replace(\">\", \"\").strip())\n",
"                continue\n",
"\n",
"            # reset so the 'Bad PE cluster' log never shows another\n",
"            # cluster's lines (or fails on an unbound name)\n",
"            lines1, lines2 = [], []\n",
"\n",
"            # do we need to split the alignment? (is there a PE insert?)\n",
"            try:\n",
"                # make into list (only read maxseqs lines, 2X cuz names)\n",
"                lclust = clust.split()[:maxseqs * 2]\n",
"\n",
"                # try to split cluster list at nnnn separator for each read;\n",
"                # a sequence without the spacer raises IndexError on [1]\n",
"                lclust1 = list(chain(*zip(\n",
"                    lclust[::2], [i.split(\"nnnn\")[0] for i in lclust[1::2]])))\n",
"                lclust2 = list(chain(*zip(\n",
"                    lclust[::2], [i.split(\"nnnn\")[1] for i in lclust[1::2]])))\n",
"\n",
"                # align the first reads, then the second reads\n",
"                align1 = _muscle(\"\\n\".join(lclust1))\n",
"                align2 = _muscle(\"\\n\".join(lclust2))\n",
"\n",
"                # join up aligned read1 and read2 and ensure names order match\n",
"                lines1 = \"\".join(align1)[1:].split(\"\\n>\")\n",
"                lines2 = \"\".join(align2)[1:].split(\"\\n>\")\n",
"                dalign1 = dict([i.split(\"\\n\", 1) for i in lines1])\n",
"                dalign2 = dict([i.split(\"\\n\", 1) for i in lines2])\n",
"\n",
"                # sort the first reads: seed on top, rest by derep size\n",
"                keys = list(dalign1.keys())\n",
"                seed = [i for i in keys if i.endswith(\"*\")][0]\n",
"                keys.pop(keys.index(seed))\n",
"                order = [seed] + sorted(\n",
"                    keys, key=_get_derep_num, reverse=True)\n",
"\n",
"                # combine in order\n",
"                alignpe = []\n",
"                for key in order:\n",
"                    alignpe.append(\"\\n\".join([\n",
"                        key,\n",
"                        dalign1[key].replace(\"\\n\", \"\") + \"nnnn\" + \\\n",
"                        dalign2[key].replace(\"\\n\", \"\")]))\n",
"\n",
"                ## append aligned cluster string\n",
"                aligned.append(\"\\n\".join(alignpe).strip())\n",
"\n",
"            # Malformed clust. Dictionary creation with only 1 element\n",
"            except ValueError:\n",
"                ip.logger.debug(\n",
"                    \"Bad PE cluster - {}\\nla1 - {}\\nla2 - {}\"\n",
"                    .format(clust, lines1, lines2))\n",
"\n",
"            ## Either reads are SE, or at least some pairs are merged.\n",
"            except IndexError:\n",
"\n",
"                # limit the number of input seqs\n",
"                lclust = \"\\n\".join(clust.split()[:maxseqs * 2])\n",
"\n",
"                # start from fresh output, whatever the paired attempt left\n",
"                align1 = _muscle(lclust)\n",
"\n",
"                ## remove '>' from names, and '\\n' from inside long seqs\n",
"                lines = \"\".join(align1)[1:].split(\"\\n>\")\n",
"\n",
"                ## find seed of the cluster and put it on top. The seed's\n",
"                ## name line ends in '*' in both the 'size=N*' and the\n",
"                ## 'size=N;*' layouts, so test the end of the name line.\n",
"                seed = [\n",
"                    i for i in lines if i.split(\"\\n\", 1)[0].endswith(\"*\")][0]\n",
"                lines.pop(lines.index(seed))\n",
"                lines = [seed] + sorted(\n",
"                    lines, key=_get_derep_num, reverse=True)\n",
"\n",
"                ## format remove extra newlines from muscle\n",
"                aa = [i.split(\"\\n\", 1) for i in lines]\n",
"                align1 = [i[0] + '\\n' + \"\".join([j.replace(\"\\n\", \"\")\n",
"                          for j in i[1:]]) for i in aa]\n",
"\n",
"                # trim edges in sloppy gbs/ezrad data.\n",
"                # Maybe relevant to other types too...\n",
"                if is_gbs:\n",
"                    align1 = _gbs_trim(align1)\n",
"\n",
"                ## append to aligned\n",
"                aligned.append(\"\\n\".join(align1))\n",
"\n",
"    finally:\n",
"        # cleanup, also when a cluster raised, so the shell is not leaked\n",
"        proc.stdout.close()\n",
"        if proc.stderr:\n",
"            proc.stderr.close()\n",
"        proc.stdin.close()\n",
"        proc.wait()\n",
"\n",
"    ## return the aligned clusters\n",
"    return aligned"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msample\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msamples\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mip\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massemble\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclustmap\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sample_cleanup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m_sample_cleanup\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 1262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1263\u001b[0m \u001b[0;31m# get maxlen and depths array from clusters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1264\u001b[0;31m \u001b[0mmaxlens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdepths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_quick_depths\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1265\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1266\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m_get_quick_depths\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 1250\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1251\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1252\u001b[0;31m \u001b[0mtdepth\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1253\u001b[0m \u001b[0mtlen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'"
]
}
],
"source": [
"# Run clustmap's private per-sample cleanup for every sample in the driver.\n",
"for sample in s3.samples:\n",
" ip.assemble.clustmap._sample_cleanup(s3.data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mip\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massemble\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclustmap\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_quick_depths\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m_get_quick_depths\u001b[0;34m(data, sample)\u001b[0m\n\u001b[1;32m 1250\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1251\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1252\u001b[0;31m \u001b[0mtdepth\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\";\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1253\u001b[0m \u001b[0mtlen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'"
]
}
],
"source": [
"# Call the depth helper on its own, for whichever sample the name `sample`\n",
"# is currently bound to.\n",
"ip.assemble.clustmap._get_quick_depths(s3.data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# No-op probe: only checks that looking up the 'clusters' entry of\n",
"# sample.files does not raise; nothing is done with the value.\n",
"if sample.files.get('clusters'):\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Walk the sample's gzipped cluster file two lines at a time and record, per\n",
"# cluster, the summed read depth and the length of its last sequence line.\n",
"fclust = data.samples[sample.name].files.clusters\n",
"\n",
"## storage\n",
"depths = []\n",
"maxlen = []\n",
"\n",
"## start with cluster 0\n",
"tdepth = 0\n",
"tlen = 0\n",
"\n",
"# the with-block closes the gzip handle even when a name fails to parse\n",
"with gzip.open(fclust, 'rt') as clusters:\n",
"\n",
"    # the same iterator twice, so consecutive lines come out as (name, seq)\n",
"    pairdealer = izip(*[iter(clusters)] * 2)\n",
"\n",
"    ## iterate until empty\n",
"    for name, seq in pairdealer:\n",
"\n",
"        if name.strip() == seq.strip():\n",
"            # two identical lines are the separator that ends a cluster\n",
"            depths.append(tdepth)\n",
"            maxlen.append(tlen)\n",
"            tlen = 0\n",
"            tdepth = 0\n",
"\n",
"        else:\n",
"            # depth is the number after the last '=', minus the one\n",
"            # marker character that trails it\n",
"            tdepth += int(name.strip().split(\"=\")[-1][:-1])\n",
"            # NOTE: seq is not stripped, so its line ending is counted\n",
"            tlen = len(seq)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1D_0'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show which sample the name `sample` is currently bound to.\n",
"sample.name\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:02 | dereplicating | s3 |\n"
]
}
],
"source": [
"# Run the dereplication stage of step 3.\n",
"s3.remote_run_dereps()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:05 | clustering/mapping | s3 |\n",
"[####################] 100% 0:00:00 | building clusters | s3 |\n",
"[####################] 100% 0:00:00 | chunking clusters | s3 |\n"
]
}
],
"source": [
"# Run the clustering/mapping, cluster-building and chunking stages of step 3.\n",
"s3.remote_run_cluster_map_build()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:02 | aligning clusters | s3 |\n"
]
},
{
"ename": "IPyradError",
"evalue": "TypeError(a bytes-like object is required, not 'str')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mremote_run_align_cleanup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36mremote_run_align_cleanup\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mjob\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mallasyncs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msuccessful\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 335\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIPyradError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 336\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;31m# track job progress\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIPyradError\u001b[0m: TypeError(a bytes-like object is required, not 'str')"
]
}
],
"source": [
"# Run the cluster-alignment stage of step 3 and its cleanup.\n",
"s3.remote_run_align_cleanup()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"handle = \"pairtest-tmpalign/1A_0_chunk_0.ali\"\n",
"\n",
"# read the whole chunk, cut it at the '//' separator pair and drop the\n",
"# empty pieces the split leaves behind\n",
"with open(handle, 'rb') as infile:\n",
"    clusts = [\n",
"        block for block in infile.read().decode().split(\"//\\n//\\n\") if block\n",
"    ]\n",
"\n",
"# an entirely empty chunk is treated as an error here\n",
"if not clusts:\n",
"    raise IPyradError(\"no clusters in file: {}\".format(handle))\n"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"# Settings read by the inline alignment loop in the next cell: the cap on\n",
"# sequences aligned per cluster, and whether unpaired alignments are passed\n",
"# through _gbs_trim.\n",
"maxseqs = 200\n",
"is_gbs = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inline copy of the muscle alignment loop, kept at top level so that proc,\n",
"# clust, align1, align2 and aligned can be inspected after it runs.\n",
"proc = sps.Popen(\n",
"    [\"bash\"],\n",
"    stdin=sps.PIPE,\n",
"    stdout=sps.PIPE,\n",
"    bufsize=0,\n",
"    )\n",
"\n",
"## iterate over clusters in this file until finished\n",
"aligned = []\n",
"for clust in clusts:\n",
"\n",
"    ## decoded muscle output lines for read1s and read2s\n",
"    align1 = []\n",
"    align2 = []\n",
"\n",
"    ## reset so the 'Bad PE cluster' log never shows another cluster's lines\n",
"    lines1, lines2 = [], []\n",
"\n",
"    ## don't bother aligning if only one seq\n",
"    if clust.count(\">\") == 1:\n",
"        aligned.append(clust.replace(\">\", \"\").strip())\n",
"    else:\n",
"\n",
"        # do we need to split the alignment? (is there a PE insert?)\n",
"        try:\n",
"            # make into list (only read maxseqs lines, 2X cuz names)\n",
"            lclust = clust.split()[:maxseqs * 2]\n",
"\n",
"            # try to split cluster list at nnnn separator for each read\n",
"            lclust1 = list(chain(*zip(\n",
"                lclust[::2], [i.split(\"nnnn\")[0] for i in lclust[1::2]])))\n",
"            lclust2 = list(chain(*zip(\n",
"                lclust[::2], [i.split(\"nnnn\")[1] for i in lclust[1::2]])))\n",
"\n",
"            # put back into strings\n",
"            clust1 = \"\\n\".join(lclust1)\n",
"            clust2 = \"\\n\".join(lclust2)\n",
"\n",
"            # Align the first reads.\n",
"            # The muscle command with alignment as stdin and // as split\n",
"            cmd1 = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n",
"                    .format(clust1, ip.bins.muscle, \"//\\n\"))\n",
"\n",
"            # send cmd1 to the bash shell\n",
"            proc.stdin.write(cmd1.encode())\n",
"\n",
"            # read the stdout by line until splitter is reached. The pipe\n",
"            # yields bytes, so the sentinel must be bytes too and each line\n",
"            # is decoded before it is joined with str further down.\n",
"            for line in iter(proc.stdout.readline, b'//\\n'):\n",
"                align1.append(line.decode())\n",
"\n",
"            # Align the second reads.\n",
"            # The muscle command with alignment as stdin and // as split\n",
"            cmd2 = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n",
"                    .format(clust2, ip.bins.muscle, \"//\\n\"))\n",
"\n",
"            # send cmd2 to the bash shell\n",
"            proc.stdin.write(cmd2.encode())\n",
"\n",
"            # read the stdout by line until splitter is reached\n",
"            # meaning that the alignment is finished.\n",
"            for line in iter(proc.stdout.readline, b'//\\n'):\n",
"                align2.append(line.decode())\n",
"\n",
"            # join up aligned read1 and read2 and ensure names order match\n",
"            lines1 = \"\".join(align1)[1:].split(\"\\n>\")\n",
"            lines2 = \"\".join(align2)[1:].split(\"\\n>\")\n",
"            dalign1 = dict([i.split(\"\\n\", 1) for i in lines1])\n",
"            dalign2 = dict([i.split(\"\\n\", 1) for i in lines2])\n",
"\n",
"            # sort the first reads\n",
"            keys = list(dalign1.keys())\n",
"            seed = [i for i in keys if i[-1] == \"*\"][0]\n",
"            keys.pop(keys.index(seed))\n",
"            order = [seed] + sorted(\n",
"                keys, key=_get_derep_num, reverse=True)\n",
"\n",
"            # combine in order, into a list of its own: align1 still holds\n",
"            # the raw muscle output and must not end up in the result\n",
"            alignpe = []\n",
"            for key in order:\n",
"                alignpe.append(\"\\n\".join([\n",
"                    key,\n",
"                    dalign1[key].replace(\"\\n\", \"\") + \"nnnn\" + \\\n",
"                    dalign2[key].replace(\"\\n\", \"\")]))\n",
"\n",
"            ## append aligned cluster string\n",
"            aligned.append(\"\\n\".join(alignpe).strip())\n",
"\n",
"        # Malformed clust. Dictionary creation with only 1 element\n",
"        except ValueError:\n",
"            ip.logger.debug(\n",
"                \"Bad PE cluster - {}\\nla1 - {}\\nla2 - {}\"\n",
"                .format(clust, lines1, lines2))\n",
"\n",
"        ## Either reads are SE, or at least some pairs are merged.\n",
"        except IndexError:\n",
"\n",
"            # limit the number of input seqs\n",
"            lclust = \"\\n\".join(clust.split()[:maxseqs * 2])\n",
"\n",
"            # the muscle command with alignment as stdin and // as splitter\n",
"            cmd = (\"echo -e '{}' | {} -quiet -in - ; echo {}\"\n",
"                   .format(lclust, ip.bins.muscle, \"//\\n\"))\n",
"\n",
"            ## send cmd to the bash shell (TODO: PIPE could overflow here!)\n",
"            proc.stdin.write(cmd.encode())\n",
"\n",
"            ## read the stdout by line until // is reached. This BLOCKS.\n",
"            ## Start from an empty list, whatever the paired attempt left.\n",
"            align1 = []\n",
"            for line in iter(proc.stdout.readline, b'//\\n'):\n",
"                align1.append(line.decode())\n",
"\n",
"            ## remove '>' from names, and '\\n' from inside long seqs\n",
"            lines = \"\".join(align1)[1:].split(\"\\n>\")\n",
"\n",
"            ## find seed of the cluster and put it on top. The seed's name\n",
"            ## line ends in '*' in both the 'size=N*' and the 'size=N;*'\n",
"            ## layouts, so test the end of the name line.\n",
"            seed = [\n",
"                i for i in lines if i.split(\"\\n\", 1)[0].endswith(\"*\")][0]\n",
"            lines.pop(lines.index(seed))\n",
"            lines = [seed] + sorted(\n",
"                lines, key=_get_derep_num, reverse=True)\n",
"\n",
"            ## format remove extra newlines from muscle\n",
"            aa = [i.split(\"\\n\", 1) for i in lines]\n",
"            align1 = [i[0] + '\\n' + \"\".join([j.replace(\"\\n\", \"\")\n",
"                      for j in i[1:]]) for i in aa]\n",
"\n",
"            # trim edges in sloppy gbs/ezrad data.\n",
"            # Maybe relevant to other types too...\n",
"            if is_gbs:\n",
"                align1 = _gbs_trim(align1)\n",
"\n",
"            ## append to aligned\n",
"            aligned.append(\"\\n\".join(align1))\n",
"\n",
"# cleanup\n",
"proc.stdout.close()\n",
"if proc.stderr:\n",
"    proc.stderr.close()\n",
"proc.stdin.close()\n",
"proc.wait()\n",
"\n",
"## `aligned` now holds the aligned clusters\n",
"#return aligned "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"from ipyrad.assemble.clustmap import _get_derep_num"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"# Parse the aligned read1 / read2 output lines into {name: sequence} dicts.\n",
"# Dropping the first '>' and splitting on a newline followed by '>' leaves\n",
"# one 'name<newline>sequence' record per read.\n",
"lines1 = \"\".join(align1)[1:].split(\"\\n>\")\n",
"lines2 = \"\".join(align2)[1:].split(\"\\n>\")\n",
"dalign1 = dict(rec.split(\"\\n\", 1) for rec in lines1)\n",
"dalign2 = dict(rec.split(\"\\n\", 1) for rec in lines2)\n",
"\n",
"# Order the read1 names: the seed (name ending in '*') goes first, the rest\n",
"# follow in decreasing _get_derep_num order.\n",
"keys = list(dalign1)\n",
"seed = [name for name in keys if name[-1] == \"*\"][0]\n",
"keys.remove(seed)\n",
"order = [seed]\n",
"order.extend(sorted(keys, key=_get_derep_num, reverse=True))"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"# join up aligned read1 and read2 and ensure names order match\n",
"lines1 = \"\".join(align1)[1:].split(\"\\n>\")\n",
"lines2 = \"\".join(align2)[1:].split(\"\\n>\")\n",
"dalign1 = dict([i.split(\"\\n\", 1) for i in lines1])\n",
"dalign2 = dict([i.split(\"\\n\", 1) for i in lines2])\n",
"\n",
"# sort the first reads: seed (name ending in '*') on top, the rest by\n",
"# decreasing _get_derep_num\n",
"keys = list(dalign1.keys())\n",
"seed = [i for i in keys if i[-1] == \"*\"][0]\n",
"keys.pop(keys.index(seed))\n",
"order = [seed] + sorted(\n",
"    keys, key=_get_derep_num, reverse=True)\n",
"\n",
"# combine in order into a FRESH list. Appending to align1 (as before) left\n",
"# the raw read1 aligner lines in front of the combined records and mutated\n",
"# align1, so the cell gave a different result every time it was re-run.\n",
"alignpe = []\n",
"for key in order:\n",
"    alignpe.append(\"\\n\".join([\n",
"        key,\n",
"        dalign1[key].replace(\"\\n\", \"\") + \"nnnn\" +\n",
"        dalign2[key].replace(\"\\n\", \"\")]))\n",
"\n",
"## append aligned cluster string\n",
"aligned.append(\"\\n\".join(alignpe).strip())"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['>12f1ffdaa3a3d7c8310998dea05a56dd;size=22;*\\n\\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\\n\\nTATATACAAAGACGGATGTGGTGCGAAATAC\\n\\n>1f4df54209f2d8d9be6019fde95d7579;size=1;+\\n\\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\\n\\nTATATACAAAGACGGATGTGGTGCGAAATAC\\n\\n>4b85ee58466075729a68837e2b016ad7;size=1;+\\n\\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\\n\\nTATATACAAAGACGGATGTGGTGCGAAATAC\\n\\n12f1ffdaa3a3d7c8310998dea05a56dd;size=22;*\\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGTTATATACAAAGACGGATGTGGTGCGAAATACnnnnGGCTCCTATTTCAAGTACCGTCTAATGTCAATAAGATGGTTTCGATGCGTGGAGAGAAACCCACTCTGAACGTCCCGATCACAGCGTTGGCTCTACTCCG\\n1f4df54209f2d8d9be6019fde95d7579;size=1;+\\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGTTATATACAAAGACGGATGTGGTGCGAAATACnnnnGGCTCCTATTTCAAGTACCGTCTAATGTCAATAAGATGGTTTCGATGCGTGGAGAGAAACCCACTCTGAACGTCCCGATCACAGCGTTCGCTCTACTCCG\\n4b85ee58466075729a68837e2b016ad7;size=1;+\\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGTTATATACAAAGACGGATGTGGTGCGAAATACnnnnGGCTCCTATTTCAAGTACCGTCTAATGTCAATAAGATGGTTTCGATGCGTGGAGAGAAACCCACTCTAAACGTCCCGATCACAGCGTTGGCTCTACTCCG']"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aligned"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"## remove '>' from names, and '\\n' from inside long seqs \n",
"lines1 = \"\".join(align1)[1:].split(\"\\n>\")\n",
"seed = [i for i in lines1 if i.split(\";\")[-1][0] == \"*\"][0]\n",
"lines1.pop(lines1.index(seed))\n",
"lines1 = [seed] + sorted(\n",
" lines1, key=_get_derep_num, reverse=True)\n",
"dalign1 = dict([i.split(\"\\n\", 1) for i in lines1])\n",
"\n",
"\n",
"lines2 = \"\".join(align2)[1:].split(\"\\n>\")\n",
"dalign2 = dict([i.split(\"\\n\", 1) for i in lines2])\n",
"#seed = [i for i in lines2 if i.split(\";\")[-1][0] == \"*\"][0]\n",
"#lines2.pop(lines2.index(seed))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"## format remove extra newlines from muscle\n",
"aa = [i.split(\"\\n\", 1) for i in lines]\n",
"align1 = [i[0] + '\\n' + \"\".join([j.replace(\"\\n\", \"\") \n",
" for j in i[1:]]) for i in aa]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['>12f1ffdaa3a3d7c8310998dea05a56dd;size=22;*\\n',\n",
" 'TGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\\n',\n",
" 'TATATACAAAGACGGATGTGGTGCGAAATAC\\n',\n",
" '>1f4df54209f2d8d9be6019fde95d7579;size=1;+\\n',\n",
" 'TGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\\n',\n",
" 'TATATACAAAGACGGATGTGGTGCGAAATAC\\n',\n",
" '>4b85ee58466075729a68837e2b016ad7;size=1;+\\n',\n",
" 'TGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\\n',\n",
" 'TATATACAAAGACGGATGTGGTGCGAAATAC\\n']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"align1"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"derephandle = os.path.join(data.tmpdir, sample.name + \"_derep.fastq\")\n",
"uhandle = os.path.join(data.dirs.clusts, sample.name + \".utemp\")\n",
"temphandle = os.path.join(data.dirs.clusts, sample.name + \".htemp\")\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"derepfile = os.path.join(data.tmpdir, sample.name + \"_derep.fastq\")\n",
"uhandle = os.path.join(data.dirs.clusts, sample.name + \".utemp\")\n",
"usort = os.path.join(data.dirs.clusts, sample.name + \".utemp.sort\")\n",
"hhandle = os.path.join(data.dirs.clusts, sample.name + \".htemp\")\n",
"sample.files.clusters = os.path.join(\n",
" data.dirs.clusts, sample.name + \".clust.gz\")\n",
"clustsout = gzip.open(sample.files.clusters, 'wt')\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# sort the .utemp file with the key starting at field 2; result goes to usort\n",
"cmd = [\"sort\", \"-k\", \"2\", uhandle, \"-o\", usort]\n",
"proc = sps.Popen(cmd, close_fds=True)\n",
"proc.communicate()\n",
"\n",
"# fail here rather than in a later cell that reads a missing/partial usort\n",
"if proc.returncode:\n",
"    raise sps.CalledProcessError(proc.returncode, cmd)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# map each name line (stripped, minus its first character) to the stripped\n",
"# sequence line that follows it\n",
"alldereps = {}\n",
"with open(derepfile, 'rt') as ioderep:\n",
"    # the same iterator twice -> the file is consumed two lines at a time\n",
"    for namestr, seq in izip(*[iter(ioderep)] * 2):\n",
"        alldereps[namestr.strip()[1:]] = seq.strip()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "invalid literal for int() with base 10: '1+\\nTGCAGGTCACTTTTCAAGATACACTATTGTTATTACTGTGAGACACAAAGCTAATTCATCACTTCACGGATACCGCGTCCTCCTATAACGCnnnnCAATATTAACGCGTGAGTACCGGTTTCCTTGTGAGGAAGGCCCACTCTCAGTACCACCCTTATCCTATTCTAAGGCACACATGCATAGACCACTCAACCG",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 25\u001b[0m fseqs = [fseqs[0]] + sorted(fseqs[1:], \n\u001b[1;32m 26\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m int(x.split(\";size=\")[1].split(\";\")[0]), reverse=True) \n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0mseqlist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfseqs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mseqsize\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 25\u001b[0m fseqs = [fseqs[0]] + sorted(fseqs[1:], \n\u001b[1;32m 26\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m int(x.split(\";size=\")[1].split(\";\")[0]), reverse=True) \n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0mseqlist\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfseqs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mseqsize\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: '1+\\nTGCAGGTCACTTTTCAAGATACACTATTGTTATTACTGTGAGACACAAAGCTAATTCATCACTTCACGGATACCGCGTCCTCCTATAACGCnnnnCAATATTAACGCGTGAGTACCGGTTTCCTTGTGAGGAAGGCCCACTCTCAGTACCACCCTTATCCTATTCTAAGGCACACATGCATAGACCACTCAACCG"
]
}
],
"source": [
"seedsseen = set()\n",
"maxindels = 8\n",
"\n",
"\n",
"def _derep_size(entry):\n",
"    \"\"\"\n",
"    Return the integer that follows ';size=' on the header line of `entry`.\n",
"\n",
"    Only the header (text before the first newline) is parsed, and a seed or\n",
"    orientation marker ('*', '+', '-') glued to the digits is ignored, so both\n",
"    '...;size=3;+' and '...;size=3+' headers yield 3. The previous inline key\n",
"    raised ValueError on the second form.\n",
"    \"\"\"\n",
"    header = entry.split(\"\\n\", 1)[0]\n",
"    return int(header.split(\";size=\")[1].split(\";\")[0].rstrip(\"*+-\"))\n",
"\n",
"\n",
"def _sort_cluster(entries):\n",
"    \"\"\"Keep entries[0] (the seed) on top; sort the rest by decreasing size.\"\"\"\n",
"    return [entries[0]] + sorted(entries[1:], key=_derep_size, reverse=True)\n",
"\n",
"\n",
"## Iterate through the usort file grabbing matches to build clusters\n",
"with open(usort, 'rt') as insort:\n",
"    lastseed = None\n",
"    fseqs = []\n",
"    seqlist = []\n",
"    seqsize = 0\n",
"    for line in insort:\n",
"        ## six whitespace-separated fields are expected on every line\n",
"        hit, seed, _, ind, ori, _ = line.strip().split()\n",
"\n",
"        ## new seed: close out the previous cluster and start a new one\n",
"        if seed != lastseed:\n",
"            seedsseen.add(seed)\n",
"            ## store the last cluster (fseqs), count it, and clear fseqs\n",
"            if fseqs:\n",
"                seqlist.append(\"\\n\".join(_sort_cluster(fseqs)))\n",
"                seqsize += 1\n",
"                fseqs = []\n",
"\n",
"            # occasionally write/dump stored clusters to file and clear mem\n",
"            if not seqsize % 10000:\n",
"                if seqlist:\n",
"                    clustsout.write(\n",
"                        \"\\n//\\n//\\n\".join(seqlist) + \"\\n//\\n//\\n\")\n",
"                    ## reset list\n",
"                    seqlist = []\n",
"\n",
"            ## store the new seed on top of fseq list\n",
"            fseqs.append(\">{}*\\n{}\".format(seed, alldereps[seed]))\n",
"            lastseed = seed\n",
"\n",
"        ## add match to the seed\n",
"        ## revcomp if orientation is reversed (comp preserves nnnn)\n",
"        if ori == \"-\":\n",
"            seq = comp(alldereps[hit])[::-1]\n",
"        else:\n",
"            seq = alldereps[hit]\n",
"        ## only save if not too many indels\n",
"        if int(ind) <= maxindels:\n",
"            fseqs.append(\">{}{}\\n{}\".format(hit, ori, seq))\n",
"        else:\n",
"            ip.logger.info(\"filtered by maxindels: %s %s\", ind, seq)\n",
"\n",
"## write whatever is left over to the clusts file; the final cluster is\n",
"## sorted like every other one (it used to be written unsorted)\n",
"if fseqs:\n",
"    seqlist.append(\"\\n\".join(_sort_cluster(fseqs)))\n",
"if seqlist:\n",
"    clustsout.write(\"\\n//\\n//\\n\".join(seqlist) + \"\\n//\\n//\\n\")\n",
"\n",
"## now write the seeds that had no hits, read from htemp two lines at a time\n",
"with open(hhandle, 'rt') as iotemp:\n",
"    seqlist = []\n",
"    seqsize = 0\n",
"    for nnn, _ in izip(*[iter(iotemp)] * 2):\n",
"        nnn = nnn.strip()\n",
"\n",
"        ## occasionally write to file\n",
"        if not seqsize % 10000:\n",
"            if seqlist:\n",
"                clustsout.write(\"\\n//\\n//\\n\".join(seqlist) + \"\\n//\\n//\\n\")\n",
"                ## reset list\n",
"                seqlist = []\n",
"\n",
"        ## append to list if new seed\n",
"        if nnn[1:] not in seedsseen:\n",
"            seqlist.append(\"{}*\\n{}\".format(nnn, alldereps[nnn[1:]]))\n",
"            seqsize += 1\n",
"\n",
"## write whatever is left over to the clusts file\n",
"if seqlist:\n",
"    clustsout.write(\"\\n//\\n//\\n\".join(seqlist))\n",
"\n",
"## close the file handle (opened in an earlier cell) and free the lookup\n",
"clustsout.close()\n",
"del alldereps"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'00519cc27c6ab8f71dcdef028ad184e8;size=12',\n",
" '00a2c3fa80127d8adfc783464de05df4;size=10'}"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seedsseen"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"sample.concatfiles = concat_multiple_edits(data, sample)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"sample.mergedfile = merge_pairs(data, sample, 1, 1) \n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"new_derep_and_sort(\n",
" data,\n",
" sample.mergedfile,\n",
" os.path.join(data.tmpdir, sample.name + \"_derep.fastq\"),\n",
" 2)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"data = data\n",
"sample = sample\n",
"revcomp = 1\n",
"vsearch_merge = 1\n",
"\n",
"sample.concatfiles = concat_multiple_edits(data, sample)\n",
"sample.mergefile = os.path.join(data.tmpdir, sample.name + \"_merged_.fastq\") "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"sample.mergefile = os.path.join(data.tmpdir, sample.name + \"_merged_.fastq\") \n",
"if 'pair' in data.paramsdict['datatype']:\n",
" if \"reference\" not in data.paramsdict[\"assembly_method\"]:\n",
" nmerged = ip.assemble.clustmap._merge_pairs(data, sample, 1, 1)\n",
" else:\n",
" nmerged = 0 # _merge_pairs(data, sample, 0, 0) \n",
" sample.stats.reads_merged = nmerged"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# new_derep_and_sort takes (data, infile, outfile, nthreads); the previous\n",
"# call passed an undefined name (`sampl`) and only two arguments\n",
"new_derep_and_sort(\n",
"    data,\n",
"    sample.mergefile,\n",
"    os.path.join(data.tmpdir, sample.name + \"_derep.fastq\"),\n",
"    2)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def new_derep_and_sort(data, infile, outfile, nthreads):\n",
"    \"\"\"\n",
"    Dereplicates reads and sorts so reads that were highly replicated are at\n",
"    the top, and singletons at bottom, writes output to derep file. Paired\n",
"    reads are dereplicated as one concatenated read and later split again.\n",
"    Takes infile and outfile explicitly to support the double dereplication\n",
"    that is needed for 3rad (5/29/15 iao).\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : object whose ``paramsdict[\"datatype\"]`` selects the strand mode.\n",
"    infile : path handed to vsearch ``--derep_fulllength``.\n",
"    outfile : path handed to vsearch ``--output``.\n",
"    nthreads : thread count, passed to vsearch ``--threads`` as a string.\n",
"\n",
"    Raises\n",
"    ------\n",
"    IPyradWarningExit\n",
"        If vsearch exits with a non-zero return code; the captured\n",
"        stdout/stderr of the process is used as the message.\n",
"    \"\"\"\n",
"    ## datatypes options: gbs and 2brad are dereplicated on both strands.\n",
"    ## The old test `is ('gbs' or '2brad')` collapsed to an identity check\n",
"    ## against 'gbs' alone, so '2brad' never selected \"both\".\n",
"    strand = \"plus\"\n",
"    if data.paramsdict[\"datatype\"] in (\"gbs\", \"2brad\"):\n",
"        strand = \"both\"\n",
"\n",
"    ## do dereplication with vsearch\n",
"    cmd = [\n",
"        ip.bins.vsearch,\n",
"        \"--derep_fulllength\", infile,\n",
"        \"--strand\", strand,\n",
"        \"--output\", outfile,\n",
"        \"--threads\", str(nthreads),\n",
"        \"--fasta_width\", str(0),\n",
"        \"--fastq_qmax\", \"1000\",\n",
"        \"--sizeout\",\n",
"        \"--relabel_md5\",\n",
"    ]\n",
"    ip.logger.info(\"derep cmd %s\", cmd)\n",
"\n",
"    ## build PIPEd job; stderr is folded into stdout so errmsg holds both\n",
"    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)\n",
"    errmsg = proc.communicate()[0]\n",
"    if proc.returncode:\n",
"        ip.logger.error(\"error inside derep_and_sort %s\", errmsg)\n",
"        raise IPyradWarningExit(errmsg)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# all four arguments (data, infile, outfile, nthreads) are required\n",
"new_derep_and_sort(\n",
"    data,\n",
"    sample.mergefile,\n",
"    os.path.join(data.tmpdir, sample.name + \"_derep.fastq\"),\n",
"    2)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"## CONCAT FILES FOR MERGED ASSEMBLIES\n",
"mergefile = os.path.join(data.tmpdir, sample.name + \"_merged_.fastq\")\n",
"sample.files.edits = concat_multiple_edits(data, sample)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "tuple index out of range",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfiles\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0medits\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmergefile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m nmerged = ip.assemble.clustmap._merge_pairs(\n\u001b[0;32m----> 3\u001b[0;31m data, sample.files.edits, mergefile, 1, 1)\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreads_merged\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnmerged\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Documents/ipyrad/ipyrad/assemble/clustmap.py\u001b[0m in \u001b[0;36m_merge_pairs\u001b[0;34m(data, two_files, merged_out, revcomp, merge)\u001b[0m\n\u001b[1;32m 693\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 694\u001b[0m \u001b[0mtmp1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtwo_files\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 695\u001b[0;31m \u001b[0mtmp2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtwo_files\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 697\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIndexError\u001b[0m: tuple index out of range"
]
}
],
"source": [
"sample.files.edits = [(mergefile, )]\n",
"nmerged = ip.assemble.clustmap._merge_pairs(\n",
" data, sample.files.edits, mergefile, 1, 1)\n",
"sample.stats.reads_merged = nmerged"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/deren/Documents/ipyrad/pairtest-tmpalign/2E_0_merged_.fastq'"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mergefile"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('/home/deren/Documents/ipyrad/pairtest-tmpalign/2E_0_merged_.fastq',)]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample.files.edits"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[####################] 100% 0:00:00 | dereplicating | s3 |\n",
"[####################] 100% 0:00:01 | clustering/mapping | s3 |\n",
"[####################] 100% 0:00:00 | building clusters | s3 |\n",
"[####################] 100% 0:00:00 | chunking clusters | s3 |\n",
"[####################] 100% 0:00:15 | aligning clusters | s3 |\n",
"[####################] 100% 0:00:00 | concat clusters | s3 |\n"
]
}
],
"source": [
"s3.run()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# muscle align returns values for bad alignments\n",
"ip.assemble.cluster_within.sample_cleanup(data, samples[0])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" clusters_total | \n",
" hidepth_min | \n",
" clusters_hidepth | \n",
" avg_depth_total | \n",
" avg_depth_mj | \n",
" avg_depth_stat | \n",
" sd_depth_total | \n",
" sd_depth_mj | \n",
" sd_depth_stat | \n",
" filtered_bad_align | \n",
"
\n",
" \n",
" \n",
" \n",
" 1A_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 19.862 | \n",
" 19.862 | \n",
" 19.862 | \n",
" 2.830717 | \n",
" 2.830717 | \n",
" 2.830717 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1B_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 20.043 | \n",
" 20.043 | \n",
" 20.043 | \n",
" 2.807339 | \n",
" 2.807339 | \n",
" 2.807339 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1C_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 20.136 | \n",
" 20.136 | \n",
" 20.136 | \n",
" 2.874283 | \n",
" 2.874283 | \n",
" 2.874283 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1D_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 19.966 | \n",
" 19.966 | \n",
" 19.966 | \n",
" 2.738402 | \n",
" 2.738402 | \n",
" 2.738402 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2E_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 20.017 | \n",
" 20.017 | \n",
" 20.017 | \n",
" 2.778977 | \n",
" 2.778977 | \n",
" 2.778977 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2F_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 19.933 | \n",
" 19.933 | \n",
" 19.933 | \n",
" 2.833463 | \n",
" 2.833463 | \n",
" 2.833463 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2G_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 20.030 | \n",
" 20.030 | \n",
" 20.030 | \n",
" 2.773644 | \n",
" 2.773644 | \n",
" 2.773644 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2H_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 20.199 | \n",
" 20.199 | \n",
" 20.199 | \n",
" 2.870087 | \n",
" 2.870087 | \n",
" 2.870087 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3I_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 19.885 | \n",
" 19.885 | \n",
" 19.885 | \n",
" 3.012603 | \n",
" 3.012603 | \n",
" 3.012603 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3J_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 19.822 | \n",
" 19.822 | \n",
" 19.822 | \n",
" 2.878596 | \n",
" 2.878596 | \n",
" 2.878596 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3K_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 19.965 | \n",
" 19.965 | \n",
" 19.965 | \n",
" 2.885788 | \n",
" 2.885788 | \n",
" 2.885788 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3L_0 | \n",
" 1000.0 | \n",
" 6.0 | \n",
" 1000.0 | \n",
" 20.008 | \n",
" 20.008 | \n",
" 20.008 | \n",
" 2.904813 | \n",
" 2.904813 | \n",
" 2.904813 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" clusters_total hidepth_min clusters_hidepth avg_depth_total \\\n",
"1A_0 1000.0 6.0 1000.0 19.862 \n",
"1B_0 1000.0 6.0 1000.0 20.043 \n",
"1C_0 1000.0 6.0 1000.0 20.136 \n",
"1D_0 1000.0 6.0 1000.0 19.966 \n",
"2E_0 1000.0 6.0 1000.0 20.017 \n",
"2F_0 1000.0 6.0 1000.0 19.933 \n",
"2G_0 1000.0 6.0 1000.0 20.030 \n",
"2H_0 1000.0 6.0 1000.0 20.199 \n",
"3I_0 1000.0 6.0 1000.0 19.885 \n",
"3J_0 1000.0 6.0 1000.0 19.822 \n",
"3K_0 1000.0 6.0 1000.0 19.965 \n",
"3L_0 1000.0 6.0 1000.0 20.008 \n",
"\n",
" avg_depth_mj avg_depth_stat sd_depth_total sd_depth_mj \\\n",
"1A_0 19.862 19.862 2.830717 2.830717 \n",
"1B_0 20.043 20.043 2.807339 2.807339 \n",
"1C_0 20.136 20.136 2.874283 2.874283 \n",
"1D_0 19.966 19.966 2.738402 2.738402 \n",
"2E_0 20.017 20.017 2.778977 2.778977 \n",
"2F_0 19.933 19.933 2.833463 2.833463 \n",
"2G_0 20.030 20.030 2.773644 2.773644 \n",
"2H_0 20.199 20.199 2.870087 2.870087 \n",
"3I_0 19.885 19.885 3.012603 3.012603 \n",
"3J_0 19.822 19.822 2.878596 2.878596 \n",
"3K_0 19.965 19.965 2.885788 2.885788 \n",
"3L_0 20.008 20.008 2.904813 2.904813 \n",
"\n",
" sd_depth_stat filtered_bad_align \n",
"1A_0 2.830717 0.0 \n",
"1B_0 2.807339 0.0 \n",
"1C_0 2.874283 0.0 \n",
"1D_0 2.738402 0.0 \n",
"2E_0 2.778977 0.0 \n",
"2F_0 2.833463 0.0 \n",
"2G_0 2.773644 0.0 \n",
"2H_0 2.870087 0.0 \n",
"3I_0 3.012603 0.0 \n",
"3J_0 2.878596 0.0 \n",
"3K_0 2.885788 0.0 \n",
"3L_0 2.904813 0.0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data._build_stat(\"s3\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "IPyradError",
"evalue": "hi",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIPyradError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIPyradError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"hi\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mIPyradError\u001b[0m: hi"
]
}
],
"source": [
"raise IPyradError(\"hi\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "derep_sort_map() missing 1 required positional argument: 'nthreads'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mself\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mderep_sort_map\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msamples\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: derep_sort_map() missing 1 required positional argument: 'nthreads'"
]
}
],
"source": [
"self = data\n",
"# nthreads is a required fourth positional argument of derep_sort_map\n",
"derep_sort_map(s3.data, samples[0], s3.force, 2)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ra.exception()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.get_params()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}