{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test consistency of alignment parsing by `Targets`\n",
    "This Jupyter notebook is designed to test parsing of alignments by `Targets`.\n",
    "It does not test correctness by looking at actual alignments, but does test that results are internally consistent from the different parsing methods."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set up\n",
    "Import Python modules:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import contextlib\n",
    "import os\n",
    "import re\n",
    "import tempfile\n",
    "\n",
    "import pandas as pd\n",
    "from pandas.testing import assert_frame_equal\n",
    "\n",
    "import alignparse.minimap2\n",
    "import alignparse.targets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set up targets with the feature parsing used for the examples and also returning **all** `cs` tags and feature clipping:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "targetfile = '../notebooks/input_files/recA_amplicon.gb'\n",
    "feature_parse_specs_file = '../notebooks/input_files/recA_feature_parse_specs.yaml'\n",
    "\n",
    "# `Targets` object using the feature parsing specs from the examples\n",
    "targets = alignparse.targets.Targets(seqsfile=targetfile,\n",
    "                                     feature_parse_specs=feature_parse_specs_file)\n",
    "\n",
    "# now `Targets` object that returns `cs` and clipping for all features with no filter\n",
    "parse_d = {}\n",
    "for target in targets.targets:\n",
    "    parse_d[target.name] = {'query_clip5': None, 'query_clip3': None}\n",
    "    for featurename in target.feature_names:\n",
    "        # build a fresh spec per feature so no mutable objects are shared\n",
    "        parse_d[target.name][featurename] = {\n",
    "            'return': ['cs', 'clip5', 'clip3'],\n",
    "            'filter': {'mutation_nt_count': None,\n",
    "                       'mutation_op_count': None,\n",
    "                       'clip5': None,\n",
    "                       'clip3': None},\n",
    "        }\n",
    "targets_all = alignparse.targets.Targets(seqsfile=targetfile,\n",
    "                                         feature_parse_specs=parse_d)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check parsing consistency\n",
    "Confirm that `targets_all.parse_alignment` returns `cs`, `clip5`, and `clip3` for all features, just like the private `targets._parse_alignment_cs` method.\n",
    "Do this for both returned data frames and ones written to CSV files.\n",
    "\n",
    "First, define a function to assert that data frames are **not** equal as [here](https://stackoverflow.com/a/38778401):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def assert_frame_not_equal(*args, **kwargs):\n",
    "    \"\"\"Raise `AssertionError` if the data frames ARE equal.\n",
    "\n",
    "    All arguments are passed unchanged to `assert_frame_equal`;\n",
    "    this function simply inverts its outcome.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        assert_frame_equal(*args, **kwargs)\n",
    "    except AssertionError:\n",
    "        # frames are not equal\n",
    "        pass\n",
    "    else:\n",
    "        # frames are equal\n",
    "        raise AssertionError('frames unexpectedly equal')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now do the tests:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "queryfile = '../notebooks/input_files/recA_lib-1_ccs.fastq'\n",
    "\n",
    "mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_CODON_DMS)\n",
    "\n",
    "# the `ExitStack` cleans up all temporary files / directories on exit\n",
    "with contextlib.ExitStack() as stack:\n",
    "    # make alignment SAM files\n",
    "    alignmentfile = stack.enter_context(\n",
    "        tempfile.NamedTemporaryFile('r+', suffix='.sam'))\n",
    "    alignmentfile_all = stack.enter_context(\n",
    "        tempfile.NamedTemporaryFile('r+', suffix='.sam'))\n",
    "    targets.align(queryfile, alignmentfile.name, mapper)\n",
    "    targets_all.align(queryfile, alignmentfile_all.name, mapper)\n",
    "\n",
    "    # directly get data frames from alignment SAM files\n",
    "    alignments_cs = targets._parse_alignment_cs(alignmentfile.name)\n",
    "    alignments_all_cs = targets_all._parse_alignment_cs(alignmentfile_all.name)\n",
    "    # `parse_alignment` results are unpacked below as\n",
    "    # `(read_stats, aligned, filtered)`, so `[1]` is aligned, `[2]` is filtered\n",
    "    alignments = targets.parse_alignment(alignmentfile.name)\n",
    "    alignments_all = targets_all.parse_alignment(alignmentfile_all.name)\n",
    "\n",
    "    # make sure the expected data frames are identical\n",
    "    for targetname in targets.target_names:\n",
    "        assert_frame_equal(alignments_cs[targetname],\n",
    "                           alignments_all_cs[targetname])\n",
    "        assert_frame_equal(alignments_cs[targetname],\n",
    "                           alignments_all[1][targetname])\n",
    "        assert_frame_not_equal(alignments[1][targetname],\n",
    "                               alignments_all[1][targetname])\n",
    "\n",
    "    # make sure the filtering is as expected\n",
    "    for targetname in targets.target_names:\n",
    "        assert len(alignments_all[2][targetname]) == 0\n",
    "        assert len(alignments[2][targetname]) > 0\n",
    "\n",
    "    # make sure the read stats are as expected\n",
    "    for a_tup in [alignments_all, alignments]:\n",
    "        read_stats, aligned, filtered = a_tup\n",
    "        # index the stats by category once rather than once per lookup\n",
    "        stats_by_category = read_stats.set_index('category')\n",
    "        for targetname in targets.target_names:\n",
    "            aligned_df = aligned[targetname]\n",
    "            filtered_df = filtered[targetname]\n",
    "            assert len(filtered_df) == stats_by_category.at[\n",
    "                f\"filtered {targetname}\", 'count']\n",
    "            assert len(aligned_df) == stats_by_category.at[\n",
    "                f\"aligned {targetname}\", 'count']\n",
    "\n",
    "    # now get the alignments into CSV files\n",
    "    csv_dir = stack.enter_context(tempfile.TemporaryDirectory())\n",
    "    csv_dir_all = stack.enter_context(tempfile.TemporaryDirectory())\n",
    "    alignments_csv = targets.parse_alignment(alignmentfile.name,\n",
    "                                             to_csv=True,\n",
    "                                             csv_dir=csv_dir)\n",
    "    # parse the SAM file made by `targets_all`, i.e. the same file that\n",
    "    # produced the `alignments_all` frames these CSVs are compared against\n",
    "    alignments_all_csv = targets_all.parse_alignment(alignmentfile_all.name,\n",
    "                                                     to_csv=True,\n",
    "                                                     csv_dir=csv_dir_all)\n",
    "\n",
    "    # make sure the CSV files match those returned directly as data frames\n",
    "    for a, a_csv in [(alignments, alignments_csv),\n",
    "                     (alignments_all, alignments_all_csv)]:\n",
    "        read_stats, aligned, filtered = a\n",
    "        read_stats_csv, aligned_csv, filtered_csv = a_csv\n",
    "        assert_frame_equal(read_stats, read_stats_csv)\n",
    "        for targetname in targets.target_names:\n",
    "            # `fillna('')` is applied to the frames read back from CSV\n",
    "            # before comparing them with the in-memory frames\n",
    "            assert_frame_equal(aligned[targetname],\n",
    "                               pd.read_csv(aligned_csv[targetname]).fillna(''))\n",
    "            assert_frame_equal(filtered[targetname],\n",
    "                               pd.read_csv(filtered_csv[targetname]).fillna(''))"
   ]
  }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }