{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test `Targets.align_and_parse`\n", "This notebook makes sure that running `Targets.align_and_parse` returns the same output as running `Targets.align` and `Targets.parse_alignment` separately." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup\n", "Import Python modules:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import warnings\n", "\n", "import pandas as pd\n", "\n", "import alignparse.ccs\n", "import alignparse.minimap2\n", "import alignparse.targets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Suppress warnings that clutter output:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "warnings.simplefilter('ignore')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Directory for output:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "outdir_base = '_temp_check_Targets_align_and_parse'\n", "outdir_separate = os.path.join(outdir_base, 'separate/')\n", "outdir_together = os.path.join(outdir_base, 'together/')\n", "os.makedirs(outdir_separate, exist_ok=True)\n", "os.makedirs(outdir_together, exist_ok=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get Target:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "recA_targetfile = '../notebooks/input_files/recA_amplicon.gb'\n", "recA_parse_specs_file = '../notebooks/input_files/recA_feature_parse_specs.yaml'\n", "targets = alignparse.targets.Targets(\n", " seqsfile=recA_targetfile,\n", " feature_parse_specs=recA_parse_specs_file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the PacBio runs:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namereportfastq
0recA_lib-1../notebooks/input_files/recA_lib-1_report.txt../notebooks/input_files/recA_lib-1_ccs.fastq
1recA_lib-2../notebooks/input_files/recA_lib-2_report.txt../notebooks/input_files/recA_lib-2_ccs.fastq
\n", "
" ], "text/plain": [ " name report \\\n", "0 recA_lib-1 ../notebooks/input_files/recA_lib-1_report.txt \n", "1 recA_lib-2 ../notebooks/input_files/recA_lib-2_report.txt \n", "\n", " fastq \n", "0 ../notebooks/input_files/recA_lib-1_ccs.fastq \n", "1 ../notebooks/input_files/recA_lib-2_ccs.fastq " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "run_names = ['recA_lib-1', 'recA_lib-2']\n", "ccs_dir = 'input_files'\n", "\n", "pacbio_runs = pd.DataFrame(\n", " {'name': run_names,\n", " 'report': [f\"../notebooks/{ccs_dir}/{name}_report.txt\"\n", " for name in run_names],\n", " 'fastq': [f\"../notebooks/{ccs_dir}/{name}_ccs.fastq\"\n", " for name in run_names]\n", " })\n", "\n", "pacbio_runs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a `Mapper`:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_CODON_DMS)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Align and parse separately\n", "First, add the names of the desired alignment files to our data frame:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namereportfastqalignments
0recA_lib-1../notebooks/input_files/recA_lib-1_report.txt../notebooks/input_files/recA_lib-1_ccs.fastq_temp_check_Targets_align_and_parse/separate/r...
1recA_lib-2../notebooks/input_files/recA_lib-2_report.txt../notebooks/input_files/recA_lib-2_ccs.fastq_temp_check_Targets_align_and_parse/separate/r...
\n", "
" ], "text/plain": [ " name report \\\n", "0 recA_lib-1 ../notebooks/input_files/recA_lib-1_report.txt \n", "1 recA_lib-2 ../notebooks/input_files/recA_lib-2_report.txt \n", "\n", " fastq \\\n", "0 ../notebooks/input_files/recA_lib-1_ccs.fastq \n", "1 ../notebooks/input_files/recA_lib-2_ccs.fastq \n", "\n", " alignments \n", "0 _temp_check_Targets_align_and_parse/separate/r... \n", "1 _temp_check_Targets_align_and_parse/separate/r... " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pacbio_runs = pacbio_runs.assign(alignments=lambda x: outdir_separate +\n", " x['name'] + '_alignments.sam')\n", "\n", "pacbio_runs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now use the mapper to actually align the FASTQ queries to the target:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "for tup in pacbio_runs.itertuples(index=False):\n", " targets.align(queryfile=tup.fastq,\n", " alignmentfile=tup.alignments,\n", " mapper=mapper)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Parse the alignments:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "readstats = []\n", "aligned = {targetname: [] for targetname in targets.target_names}\n", "filtered = {targetname: [] for targetname in targets.target_names}\n", "\n", "for run in pacbio_runs.itertuples():\n", " \n", " run_readstats, run_aligned, run_filtered = targets.parse_alignment(run.alignments)\n", " \n", " # when concatenating add the run name to keep track of runs for results\n", " readstats.append(run_readstats\n", " .assign(name=run.name)\n", " )\n", " for targetname in targets.target_names:\n", " aligned[targetname].append(run_aligned[targetname]\n", " .assign(name=run.name)\n", " )\n", " filtered[targetname].append(run_filtered[targetname]\n", " .assign(name=run.name)\n", " )\n", " \n", "# now concatenate the data frames for each run\n", "readstats = pd.concat(readstats, ignore_index=True, sort=False)\n", "for targetname in targets.target_names:\n", " aligned[targetname] = pd.concat(aligned[targetname], ignore_index=True, sort=False)\n", " filtered[targetname] = pd.concat(filtered[targetname], ignore_index=True, sort=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First lets look at the read stats:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
categorycountname
0aligned RecA_PacBio_amplicon123recA_lib-1
1filtered RecA_PacBio_amplicon16recA_lib-1
2unmapped0recA_lib-1
3aligned RecA_PacBio_amplicon112recA_lib-2
4filtered RecA_PacBio_amplicon12recA_lib-2
5unmapped0recA_lib-2
\n", "
" ], "text/plain": [ " category count name\n", "0 aligned RecA_PacBio_amplicon 123 recA_lib-1\n", "1 filtered RecA_PacBio_amplicon 16 recA_lib-1\n", "2 unmapped 0 recA_lib-1\n", "3 aligned RecA_PacBio_amplicon 112 recA_lib-2\n", "4 filtered RecA_PacBio_amplicon 12 recA_lib-2\n", "5 unmapped 0 recA_lib-2" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "readstats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now look at the information on the filtered reads.\n", "This is a bigger data frame, so we just look at the first few lines for the first target (of which there is only one anyway):" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_namefilter_reasonname
0m54228_180801_171631/4194459/ccsspacer mutation_nt_countrecA_lib-1
1m54228_180801_171631/4325806/ccsbarcode mutation_nt_countrecA_lib-1
2m54228_180801_171631/4391313/ccstermini3 mutation_nt_countrecA_lib-1
3m54228_180801_171631/4391375/ccsgene clip3recA_lib-1
4m54228_180801_171631/4391467/ccsgene clip3recA_lib-1
\n", "
" ], "text/plain": [ " query_name filter_reason name\n", "0 m54228_180801_171631/4194459/ccs spacer mutation_nt_count recA_lib-1\n", "1 m54228_180801_171631/4325806/ccs barcode mutation_nt_count recA_lib-1\n", "2 m54228_180801_171631/4391313/ccs termini3 mutation_nt_count recA_lib-1\n", "3 m54228_180801_171631/4391375/ccs gene clip3 recA_lib-1\n", "4 m54228_180801_171631/4391467/ccs gene clip3 recA_lib-1" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered[targets.target_names[0]].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we can look at the information for the validly aligned (not filtered) reads.\n", "First just look at the first few entries in the data frame for the first target:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_namequery_clip5query_clip3gene_mutationsgene_accuracybarcode_sequencebarcode_accuracyvariant_tag5_sequencevariant_tag3_sequencename
0m54228_180801_171631/4391577/ccs10C100A T102A G658C C659T del840to8400.999455AAGATACACTCGAAATCT1.0GCrecA_lib-1
1m54228_180801_171631/4915465/ccs00C142G G144T T329A A738G A946T C947A1.000000AAATATCATCGCGGCCAG1.0TTrecA_lib-1
2m54228_180801_171631/4981392/ccs00C142G G144T T329A A738G A946T C947A1.000000AAATATCATCGCGGCCAG1.0GCrecA_lib-1
3m54228_180801_171631/6029553/ccs00T83A G84A A106T T107A G108A ins693G G862T C863...0.999940CTAATAGTAGTTTTCCAG1.0GCrecA_lib-1
4m54228_180801_171631/6488565/ccs00A254C G255T A466T T467G C468T C940G G942A0.999967TATTTATACCCATGAGTG1.0ATrecA_lib-1
\n", "
" ], "text/plain": [ " query_name query_clip5 query_clip3 \\\n", "0 m54228_180801_171631/4391577/ccs 1 0 \n", "1 m54228_180801_171631/4915465/ccs 0 0 \n", "2 m54228_180801_171631/4981392/ccs 0 0 \n", "3 m54228_180801_171631/6029553/ccs 0 0 \n", "4 m54228_180801_171631/6488565/ccs 0 0 \n", "\n", " gene_mutations gene_accuracy \\\n", "0 C100A T102A G658C C659T del840to840 0.999455 \n", "1 C142G G144T T329A A738G A946T C947A 1.000000 \n", "2 C142G G144T T329A A738G A946T C947A 1.000000 \n", "3 T83A G84A A106T T107A G108A ins693G G862T C863... 0.999940 \n", "4 A254C G255T A466T T467G C468T C940G G942A 0.999967 \n", "\n", " barcode_sequence barcode_accuracy variant_tag5_sequence \\\n", "0 AAGATACACTCGAAATCT 1.0 G \n", "1 AAATATCATCGCGGCCAG 1.0 T \n", "2 AAATATCATCGCGGCCAG 1.0 G \n", "3 CTAATAGTAGTTTTCCAG 1.0 G \n", "4 TATTTATACCCATGAGTG 1.0 A \n", "\n", " variant_tag3_sequence name \n", "0 C recA_lib-1 \n", "1 T recA_lib-1 \n", "2 C recA_lib-1 \n", "3 C recA_lib-1 \n", "4 T recA_lib-1 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aligned[targets.target_names[0]].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Align and parse together\n", "Use `Targets.align_and_parse`:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "readstats2, aligned2, filtered2 = targets.align_and_parse(\n", " df=pacbio_runs,\n", " mapper=mapper,\n", " outdir=outdir_together,\n", " name_col='name',\n", " queryfile_col='fastq',\n", " overwrite=True,\n", " ncpus=2,\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure the data frames are the same:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "pd.testing.assert_frame_equal(readstats, readstats2, check_like=True)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "pd.testing.assert_frame_equal(aligned[targets.target_names[0]],\n", " aligned2[targets.target_names[0]],\n", " check_like=True)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "pd.testing.assert_frame_equal(filtered[targets.target_names[0]],\n", " filtered2[targets.target_names[0]],\n", " check_like=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" }, "toc": { "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }