{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test alignment parsing by `Targets`\n", "This Jupyter notebook is designed to test parsing of alignments by `Targets`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import Python modules" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.471434Z", "iopub.status.busy": "2022-03-04T14:07:42.471136Z", "iopub.status.idle": "2022-03-04T14:07:42.474863Z", "shell.execute_reply": "2022-03-04T14:07:42.474311Z", "shell.execute_reply.started": "2022-03-04T14:07:42.471410Z" } }, "outputs": [], "source": [ "import contextlib\n", "import os\n", "import random\n", "import re\n", "import tempfile\n", "\n", "from IPython.display import display\n", "\n", "import pandas as pd\n", "\n", "import yaml\n", "\n", "import alignparse.minimap2\n", "import alignparse.targets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set up `Targets`\n", "Read in the RecA target used in the notebook examples:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.475807Z", "iopub.status.busy": "2022-03-04T14:07:42.475652Z", "iopub.status.idle": "2022-03-04T14:07:42.746899Z", "shell.execute_reply": "2022-03-04T14:07:42.745936Z", "shell.execute_reply.started": "2022-03-04T14:07:42.475787Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "There is just one target: ['RecA_PacBio_amplicon']\n", "It has the following features: ['termini5', 'gene', 'spacer', 'barcode', 'termini3', 'variant_tag5', 'variant_tag3']\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAADSCAYAAADpGRMOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAx60lEQVR4nO3dd3hUxfrA8e/sbnqhBOkgHaQm9BJ6E2kWVATFdi0/vFf0KiooHA4gIIq9XNu9KIpIk6YIiFQREISA9F6kt0B6dnd+f+wSQgopbHYTeD/Psw/ZPXNm3tmwb+bMOXtGaa0RQgjhHRZfByCEEDcTSbpCCOFFknSFEMKLJOkKIYQXSdIVQggvkqQrhBBeJElXFBlKqcpKqTillNXXseSGUmqyUmqs++e2Sqldvo5J+J4k3SJGKXVQKZXoTj4n3B/sUA/UW1Up5VRKfZyHfbRSKt4dy99KqbevJyEqpR5RSjnc9cUppfYrpf7v8nat9WGtdajW2pHfNnxFa71Ka13b13EI35OkWzT11lqHApFAFDDMA3UOAs4D/ZVSAXnYr5E7ls7AAOCJ64zjd3diDQX6AROVUlHXWacQhYYk3SJMa30CWIQr+aKUaqmUWqOUuqCUilFKdbhcVilVUin1P6XUMaXUeaXUnAzVDQJeA1KB3vmIZSewCqivlKqulPpVKXVWKXVGKfWtUqp4ulgqKaVmK6VOu8t8mE2dfwI7gNvc+1Vxj65t7ufllVLzlFLnlFJ7lVI5JnylVHOl1O/u9+i4UupDpZR/uu1aKTVYKbVHKXVJKTXG3Z/flVIXlVLTL5dXSnVQSh1VSg139/OgUmpgNu12UEodzek9UEpZlFKvKaUOKaVOKaW+VkoVy9D/h5VSh91tvppTn0XhIkm3CFNKVQR6AHuVUhWAH4GxQEngRWCWUuoWd/EpQDBQDygNvJOunrZARWAaMB1XAs5rLHWBtsAmQAHjgfK4EmYlYJS7nBVYABwCqgAV3O1mVWczoBawIZtmvwOOutvpB4xTSnXOIVQH8DxQCmiFa4Q+OEOZ24EmQEvgJeAzYKC7H/WBB9KVLeuuqwLwMPCZUuqa0wg5vAePuB8dgWpAKJDxj1I0UNsd+0il1G059FkUJlpreRShB3AQiAMuARpYChQHXgamZCi7CFciKAc4gRLZ1PkFMMf9cytco93SuYhFAxdxTUvsw5XwLVmUuxPYlK7+04Ati3KPAHbggruPGvgAUO7tVdyv2XAlQAcQlm7/8cDkPL6fzwE/ZOhTm3TPNwIvp3s+CXjX/XMHd7wh6bZPB0a4f54MjE1X9mgu3oOlwOB0z2u7fx+2dP2vmG77eqC/r/9fyiP3DxnpFk13aq3DcH2Q6+Aaad0K3Os+bL6glLqAa0RUDleCOqe1Pp+xIqVUEHAv8C2A1vp34DCu+dncaKy1LqG1rq61fk1r7VRKlVZKTXOfXLsIfOOOEXcsh7TW9mzqW6u1Lq5dc7plcY3Mx2VRrry7T5fSvXYI16gxW0qpWkqpBe6TkBfddZfKUOxkup8Ts3ie/sTlea11fIYYyl8rBq79HpR315G+PhtQJt1rJ9L9nJAhHlHISdItwrTWK3CNpt4CjuAa6RZP9wjRWk9wbyuZfl41nbuAcOBjdyI6gStx5XmKIZ3xuEZkDbXW4cCDuKYccMdS+fK8bA79OwnMIus55mO4+hSW7rXKwN85VPsJsBOo6Y5teLrY8qOEUiokQwzHctjnWu/BMVx/QNPXZ+fqxC+KMEm6Rd+7QFdgNdBbKdVdKWVVSgW6T95U1FofBxbiSqwllFJ+Sql27v0fBv4LNMB1Qi4SaANEKqUa5DOmMFzTAxfcc81D021bDxwHJiilQtxxtsmqEqVUBK4/CtsybtNaHwHWAOPddTQEHsc9Ys8htotAnFKqDvB/OZTPDVMp5e+eG+8FzMih/LXeg++A55XrEr5QXCPx769xZCCKGEm6RZzW+jTwNa65yb64Rm6ncY2mhnLld/wQrrnBncAp4Dl3QuyMa47yRLrHRuBnXAk5P0ygMRCL6+Te7HTxOnCNXGvgmsY4Ctyfbt9Wyn2dLq4rF04D/8qmnQdwzXMeA34ADK31khxiexHX1Mkl4HPg+7x0LAsncM1pH8OV8J/Wris5spXDe/BfXCc9VwIHgCSy778ogi6foBBC5JFyXZL3jda6oo9DEUWIjHSFEMKLJOmKbCnX/QLisnr4OrZrUUotzCbu4b6OTQiZXhBCCC+Ska4QQniRJF0hhPAiSbpCCOFFknSFEMKLJOkKIYQXSdIVQggvkqQrhBBeJElXCCG8SJKuEEJ4kSRdIYTwIkm6QgjhRZJ0hRDCiyTpCiGEF0nSFUIIL5KkK4QQXiRJVwghvEiSrhBCeJEkXSGE8CJJukII4UWSdIUQwosk6QohhBdJ0hVCCC+SpCuEEF4kSVcIIbxIkq4QQniRJF0hhPAiSbpCCOFFknSFEMKLJOkKIYQXSdIVQggvkqQrhBBeJElXCCG8SJKuEEJ4kSRdIYTwIkm6QgjhRZJ0hRDCiyTpCiGEF0nSFUIIL5KkK4QQXiRJVwghvEiSrhBCeJEkXSGE8CJJukII4UWSdIUQwosk6QohhBdJ0hVCCC+y+ToAIUTRZZqmzepnG2tRlkdSU1JKA8rXMWVktVnjLVbbotTk5GcNw/jb1/EorbWvYxBCFFHjJoz/OqJsmXta394luETpW7BYCtfBs9aahEtxbFu/0bFt/cYT9tTUGoZhJPkypsL1DgkhigzTNMMddsd93frfExxRtkyhS7gASilCwsNo3qWDtfgtEWFAN1/HVPjeJSFEUVErNDwsOSAw0Ndx5ErFalVDgUa+jkOSrhAiv2wWm63IzE9abVYL4OfrOCTpCiFuSE6nk8J4zkquXhBCeExiQgJDHv4Hu7dtx+bnR/XatXh48JOMHPIiDRpHsj1mKzabjXcmf06turdx6sQJBj8wiLiLl0hOSqJzzx68NnEcACkpKUwYPpLlPy/GYrVya7WqfPnDdAA+njiJH2f+gN1up2yF8rz5+ceULluWSaPGcHDvPuLj4jm0bz+zVv5C8RIlfPmWZCJJVwjhMcsXLSH2/HmWb98MwIXz59kes4UdW7Yy5v1JtGrfjulfTWHIoMdZuGEN4cWL89X82YSEhpKamsqA7r1Y9vNiOt7ejQ/HT+Tw/gP8/Oda/P39OXfmDACzvpnKwb37mL92JRaLha8++YzRL7zMh99+BcDalatZ9OdaSpYq5au34Zok6QohPKZuowbs3bmb4c8MoVWHdnTp2QOAKjWq06p9OwD6PTSQl598hksXL2K1WhkzdBgb1qwFrTl14iTbNsfQ8fZu/LJgISMnTcDf3x8gLYkunvcjWzZspHvjlgA47HbCioWnxdDpjtsLbcIFmdMVQnjQrdWqsWLHZtp17cyqX36lS6NmJCclZ1v+07ffI/b8BRasW8UvWzZw+529SU5yXUab3Xys1pohr73Cks3rWbJ5Pb/+9Sdzf1uetj0kNNSjffI0SbpCCI85dvQoVquV2+/sg/nOm5w7fYYL585xcO8+1q1aDcAPU6dRp0F9wsLDuXghltLlyhIYGMjxv/9m0dwFaXV17X0HX7z7ISkpKQBp0wvd+vTkq48/48L58wAkJyezLWaLl3uafzK9IITwmJ1btzHuldcAcDoc/HPYUMqUL0e9yEbM+W46xnNDsVqtvPf1lwA8/uxgnrp3IN2iWlC+UkWiO3dIq+uZV4YyftgIukU2x8/fnyo1qvP5zO/o99BAzp05S7/2XV3tOJ0MGvwU9Ro19Hp/80O+BiyEyBfTNFsWv6XUz/cO/kexa5Vbs3wFY14cxsINa7wVWpb+XLGajctXjzEMY6Qv45DpBSFE/hWhQVthCVWSrhAivy4mJybmOEXZukN7n49yARLi4pKBC76OQ5KuECK/dqampCYdP3TY13HkKDkxiQPbdzqAxb6OReZ0hRD5ZppmL5ufbXr1BvUspcqWCVCF7U5jWhN/8ZJj16YtSSkpyV+9Omz4M74OSZKuEOK6mKZZTSl1r83f7zallEeuiDpy+HDDSpUrX/91YBptt9uPOR2O+cBvhmH4POFJ0hVCFDpKqVFa61G+jqMgFLJjASGEuLHJSFcIUSDGjhnzhM3P9lJycko1iuAAz2azxlktlh+TU1KfNQzjlKfqlaQrhPC4sWPHPB0SEjSpb59OwZUqlcFqtfo6pDzRWnPxYhy/r91ij4nZdSglJbWOYRh2T9Rd5P76CCEKP5vN9tKdfTsFV6lSvsglXHCtrVasWBi3d29jCwsLuQVo56m6JekKITzKNE1LcnJKlUqVyvo6FI+oWqVCIB5cW02SrhDC0yxKkWl1YKutHHFx8T4K6Wp5icXm4bXVJOkKIYoMh8Ph6xCumyRdIYTXTJr0CdHRvbmtbjSzZl+5d+6DDw2meYvuNIrsyN33PMr58xcAWL58DY2bdOHZIa/SunVPFi78lR07dtP99vuJjOpEo8iOfPW1a920vXsP0LXrvURGdaJps678/POvafXP/uFH6taLJjq6N6+//s5VMa1b9yedO99Ds+bdaNa8Gz/++EuBvgdyP10hhNdYLBZWr57Prl17iW7bh7bRLSlduhTvvjOGUqUiABgxYgITJ37E+PGvArB16w4++nAC77/3Ona7nfoN2jNmzCvc2683AGfPngPgoYee4R9PPMjjjw1g+/ZddOh4F9v+WonW8NRTQ1m9ah61a9fgzTc/SovnwoVYBj/zMgvmf0O5cmU4fvwkLVr2YEvMMooXv+YdK/NNkq4Qwmsee+wBAGrXrkFUVAPWrttIn97dmTJlBlO/m01KSirx8QnUrFktbZ+aNavSqlVTAHbt2ofdbk9LuAARESW5dCmOzTHbePSR/gDUrVubyEb1Wbv2TzSaqKgG1K5dA4AnnniQV4aNBWDNmg0cOHCYnr0GptWnlGLv3gM0bRpZIO+BJF0hhG9ojUKxatVa/vPp16xeNY9bbinF1O9m88Xn36QVCw0NSbdL9uumZUUphdPpvEYImoYNbmP58jn560M+yJyuEMJrJk+eBsCePfvZHPMXLVo05sKFixQrFkZEREmSk5OZ/L9p2e5fp04NbDYbM2bOT3vt7NlzhIeHEdmoXtr87s6de4jZso0WLaJo1bIpmzdvZc+e/QB8+eXUtH1bt27Knr0HWLbst7TX/vhjc7ZJ3BMk6QohvCYgIIC2bfvQp+8gPvlkIqVLl6JHj05Uq1aFuvXa0rPnQKKiGmS7v81m44fZ/+OzT7+mUWRHohp35qeFSwGYMuUjpn47i8ioTjz40GC+mvwBt9xSitKlS/Gf/7xJ3zsHER3dG5vtypc1SpQozpwfvmLMmElENe5MvfptMUe/VaBJV74GLITwKNM0bUqR8tqrTylfx+IJS5b8bl+7bsurhmFM9ER9MtIVQnjcjTSW0x7ujSRdIYSnOSwWS2piYpKv4/CIuEsJKXhwbTVJukIIjzIMQ/v721ZtjtlV5Me78fGJ7Nl72Aos8VSdcsmYEMLjkpJS/rV8+Ybfjhw54V+lSoUQ1+0Lig6tITb2kmPT5p3JWuv3DcM44Km65USaEKJAmKZZCujn7+/XTCkVkJd9jxw50rBSpUo5rpGW23L54LTb7X87HM45hmGs82TFknSFEIVObtdIK4prqRWtMb8QQhRxBT7SNU2zUkCA/3t2u6O7w+EILtDGCpDFolL9/fx+T0pOedEwjD98HY8QRdGYMWMesvn5v5qSklwTrYvqoE/b/PzPgJ5iT00dZhhGSl52LtCka5pmkJ+fbW+LJvXLNGtc1xoWGoxSRe96aa01SckpbN95gEVL18Sl2h1RhmHs9XVcQhQlo0ePHhAYFPx5h153BZeteCs2W9E8j+90Orlw9gxrly1KPH3875+GvfJKv7zsX9B/aXqUvqVkaOf2zazhYSFFMuGC66YZQYEBNImsQ+NGdfytVsujvo5JiKLGzz9gWLsefYMrVqleZBMuuG5PWfKW0nS98/4gh93RyzTNiDztX1CBuTWsXqVCWAG34VW3Virn7+/v19rXcQhR1NhTU2qVrVjJ12F4jJ+/P8VKlkwC6uZlvwJNukopP6vVkml4O2rsJFJS8jQNkmcjR7/J9zPm5ljuP59/zTvvfwbA8pVrCC5ZncgWXYls0ZUWbXtlKm+1WVEeXC9JiJuF1tpisV49wp00dlSB54I3R49k7szvcyz39ef/4bMPXKtK/BWzme6tGtO1RSQdm9TjpWeeJDk5OdM+Vld/8pQPfDLGN19/mxefexp/f/9c72O32/N0SDJ65NBclXv6iUFXPa97Wy02/LYw1+0IIfLv7XEmTz/3YoHmgqEjR+eq3KAnnk77uXqt2sxfsRZ/f3+cTidPDbyXb778lMcHP5vrdrPj9aT7zHPDAWjdsS8Wi2LejMmMHv8OW7buICk5mY7tWvP2RAOr1UqHbv1o3bIJ6/7YRGBgAPfe3Yup38+heLFwtvy1gwrly/LB22MZOnwMe/YeoFmTSL753wcopXjkiedo2rgR//y/Rxk1dhK7du8j9uIl9h84RPVqtzLj288IDg5i1NhJxMXF89aEkd5+K4S4qQ1/7hkA+nZqjVIWJs+cxzvjRrPjry0kJyfRul1HjDfexmq10q97B5q0aM2mDesICAik1933Mmf6VMKLFWfHX1soW74CYyd9wJjhQzmwbw+RTZrxwX+/QSnFc08+QqOopjz6f/9k0thR7Nuzi0uxsRw6uJ9bq1bns29nEBQczKSxo4iPj2Pk+LcICgpKizM1NZWkxEQsyjMTA16/ZOOjd8cBsGbZXDavW8Lo8e/Qvm1L1q/+kc3rFnPq9Bn++9WVmxj/tX0Xi+ZP5ccfpgDwx8YY3n7DYGfMSoKCAhnwyDNMnfwR2zctZ+u2nSxdtirLdjf8GcPUyR+yY/MKUlPtfDttdpbldu/ZT+NW3WnRthdffTPdw70XQlw27l3XWmVzf13DknWbeWfcaFq2bc+Pq9azeO1mzpw+xbSv/ptWftf2v5g6bxFTfvgRgJiNf2BMeJuVm3cSGBjEM48M4KPJU1n+53Z2btvKqmVLs2w35s8NfDh5Kis27cBuT2X2tG+zLHfi2DG6toikQaVShISFMfDxJz3Sb5+fQpz342LWb9jMpPc+BSAhIZGKFcqlbR9w311XHUq0adWUihXLAxDVqD5VKleiWLFwABo1uI29+w7SpVO7TO1079IhbaG5Fs2i2Lf/UKYyjSMbcGTPHxQrFs6Bg4fpckd/KpQvm2V9QgjPWvzTPDZvXM+n700CIDExgXIVKqZtv+v+AVflgqYt21C+omt7/UZRVLq1CuHFXJ/x2xo04uC+vbTr1CVTOx26dKdY8eIARDVtwaED+7KMp2z58ixZt5mE+Hj+9diDLJw7m7739r/ufvo86WqtmTP9S6pVvTXL7aGhV3+fIjDgyle4rVYrgYFXP7fbHVnWk7FcVredCw+/cqFF1SqVubN3d377fYMkXSG8QGvNl9/P4daq1bLcHhwSetXzgMDAtJ+tVmum5w67Pct6MpZLSkq8ZlzBISH06Xc/s6d965Gk65NvhISFhRIbexGAPj27MeGtj3A4XMnyzJlzHDh42Bdhcfz4ybRlOs6dO8/ipSuJbFjPJ7EIcTMIDQvjYmwsAN3u6MNHb01IywXnzpzh8EGP3dwrTw4d2J92VUVKSgqLFsylTr3slxHKC5+MdF8Y8hSdetxHUFAg82dOZtybH9CoeVeUUgQE+PPuRJOqVSp7Pa5Zc37ik8+/xs/Pht3uYNCAfvTt3d3rcQhxs3jq2Re4745OBAYGMXnWfD6YOI6uLRqhlMLfPwDzzXepXKWq1+PasHYNH7/9Bspiwelw0DK6Pc8NG+GRugv0a8CjR49+vUN04+HtWjcusDa8bfe+w8z9cflvQ196JdrXsQhRlIwePTr1keeH24ryt9EymvP157GnTxy72zCMX3O7T4FPL9xwd47UmhutS0J4zQ2WEPKTDQo06WqtY+PjEwv26yZeFp+QhHbqs76OQ4iixmK1JiYlJvg6DI9KSkiwkMf10wp6pLt428599qSkzF+fK4qcTid/xuyMT0pO+cHXsQhR1Fit1iW7tm5y+joOTzl9/G8SE+I1EJOX/Qp6ciXGbndM+fjLmQ9GNawdFBYWkvlGDEWABpKSkvXWbXvjL1yM2wJMy2kfIcTVUpKTh25Zt6btmRPHgytUqRZitRbNuV2tnZw7fSp5z7YYp8PueMwwjKyvU82GN25iroBoq9XS22azlVNQYHn38OEjDStX9vx6SRq0w+G4YLc7FgM/G4aR6uk2hLgZmKZZArjHz9+/pVKWwOzKHTl8uGGlypVzXiMtm3K53T8/tNYOuz11t3Y6pxuGsSev+99Qa6QVxfWShBCZXe8aaYU5F2SZdE3TrAJ0A8K9HdD12LRpU7eoqKjFXmgqEVgBbDMM48b5qyVELpmmaQPaAw0ogGnK3H6WsyvnxVyQHQ38DSw0DCM2/Yarkq5pmsrPZnsdxfO1qlZ2hIYE+xelOViH02m1Wix5ml/JK/f8rn33gcPa4XAuTElN7W8YRtbfNxTiBmSaZjk/P781YaFhEbdWqORvtVk9fkLe6XRaLbn4LGdXLrf7FxSttT5z9kzy0ePHrA6H4y7DMNL+AGT8C9Xcz882ZPCgfoEhwUEUUd74arNfqt3Ol9Pm3n7y9LkHgCleaFOIQiHA3/+jRvUaVuwU3b6gz4Tl9rOcXTlfL3zpf+Tvo0ybO3OmaZoRl88FXRWU1WK5s3H92gFFOOF6jZ/NRsuoBiGBAQEP+DoWIbzFNE1lt9t7NI9qWjQvPfCyShUqUjy8mAZaXX7tqqRrs1krFAsLtXo9siKqWFgISlEu55JC3DD8HE5nQGhIiK/jKDLCQsMAbrn8/Kqkq5SyFNUVe33B/V7JGyZuKkopOXmcB5YMOdUrcx7Hjp+gY688LQ2fycFDR/jsf9/kWG7zlr+YPnvedbV1uT1bycpERndNe5w9d+666xVCZHbs+HG69b3juuo4ePgQX6RbaSI7MVu3MHNO1ivH5MXxEydo07k9zdu3pkl0CwY8+hDnL5zPcb8CT7p2u53y5cqybMHM66rn4OEjfDY562U10tu8dRvTf5h/XW1dVrxYOJtXL0l7RJQs6ZF6hRBXuHJEORbP/em66jl0+DD//WpyjuVitm5hlgeSbqmICJbMX8j6FWvYuHodFcpXYPxbb+S4X45Jd8zEd3h+mJH2/Oy5c0RUqcePi36hVZfeREV3o0GrzkybeWW58w49+zHcHE/n3vfR94FHOXjoCKWq1k/bPvAf/6Rp+x40aNWZuwY+zvnzFwBYvmoNkdFdeWrISzRs3YVGbbqwY5frCx/PvPgq23ftJjK6K/0eeiLLWM+eO8fIcW/xy4rVREZ35dmXRlyzPYBXR0+gRmQbWnTqxcsjX6dp+x45vmlCiCvGvfUGQ199Je352XNnKV+jMgsX/0z77p1o0aENTaJbMH32lYFX1z49GDFmFLff2Yt+D97PwcOHqFDzyuoxDz/1OK07taNJdAvue+iBtBHkitWraN6+Nc/8+1matm1Js3at2LlrJwDPvfRvduzeSfP2rXngkQezjPXsubOMnvA6v65YTvP2rfn3K0Ov2R7AyLEmdZs2om3Xjrw6agSt3SvJ+Pn5ERzsWtnG4XAQFx+HxZLzODbHEg8/cB/TZs3D7l76YuqMOfTt2Z3WzZuyetEcNq1ezC9zp/HiiNFXJbO/duxi0Q9T+XFG5qup3ntjNBtWLGTr70upV6cWb7z7cdq2bTt28/Rjg9iy5hfuu7M3Y998D4CP3nqdurVrsXn1EmZO+TzLWCNKlmT08Bfp0j6azauX8P7EMddsb/7CxSxY9Asxvy3h91/msWff1Xepv3gpjqbte9Ck3e28+d4n3Ejf3hPCUx7sP4AZP8xMyxHfz5xBrx49adm8Bb/+tIR1y3/jp9nzGTby1auS2fad21kwcw5zps3KVOekcW+w5teVbFy9jtvq3Mak995Jt98OnnjkcTasWss9fe9m/NtvAvDuxLe5rVYd1q9Yw3eTs56KjCgZwchXXqVT+w6sX7GGtye8ec32fvz5JxYu/pk/VqxhxaKl7N2feT215u1bU7FWVfbu38fwF1/JtD2jHJNu5UoVqFu7Jj8tdt2jd/K303l04P2cPnuWfoOepH7LTnS/ewDnzl9g194rAQ3od1e2a9N//d0MmrS7nQatOjN1xhw2b92Wtq12zepENXKNils2a8y+Awdz7EROsmtv2ao13Hdnb0JCgrFYLDw84N60fcqVLc3RHRvYsGIhC2d9w6x5P/Ll199ddyxC3GgqV6xEnVp1+HnJIgCmTPuWQQMe5MyZMzzwyIM0btOc3vfeybkL59m958qtCu6/575sc8S3339Hq05taRLdgu9nzSDmr61p22rVqElkw0YANG/ajAMH9l93H7Jrb8XqldzT925CQkKwWCw82H9Apn3Xr1jD4Z37qFOzNp9P/jLHtnI1p/vIwPv4auoM/tq+k9iLl2jbugX/9/wwOkS3YuvvS9m8egkVy5cj/S0cMy4oedmqNev45Muv+Xn2t2z9fSljR7xEUvKV/TIuPJndQpO5da32tNZkd7VGQEAApW8pBUDpW0ox8L67+W3dH9cVixA3qoceGMg306aybcd2Yi9eJLpVG/714vO0i27LxtXrWL9iDRXKl7/qs57dZWerf/+Nz/73JfOmz2bj6nWMGj6CpOQrC8kGBly9sOT15ohrtac15OaCLj8/Px7sP4Cp03O+AWGuku49fXqycs1a3vrgPzwy0DUavBB7kSqVK6GUYsmvK9m7/2BuquJCbCzFwsOJKFmC5ORk/jsld3dJDA8LI/bixVyWu5Sr9jq2bcOMOQtISEjE6XQyZdqVOadTp8+Qmuq6mVhCQiLzflpMZANZpFKIrNzVuy+rf/+Ndz58j4f6DwQg9mIst1aqjFKKX5b9yr79uRuRxsbGUiwsnIiSESQnJ/PVt7n7wmd4WBixl3KbI66Uu1Z77aPbMnveHBISEnA6nVcl1SN/HyUuLg5w3Wt7zoK51Kubc47IVdINDg6i7x3dmTJtFoP6u5LuhFHDefG10bTq0puZcxfQsP5tuamKHl07Ub3qrdRp2o4e9zxI40a5W2GzYf3bqF2zOvVbdsr2RBpA5/bRxCck0KhNF559acQ12+tzRze6d+5AozZd6NT7PmpUq0ox9zLsq39fT1Tb7jRq04WmHXoQ1bA+/3zy0VzFKsTNJjg4mF49ejJ1+jQG9nd9SXPMCJNXjNdo370TP8yfQ4N69XOoxaV7l25Uq1qVhi0b0+e+u4lq1ChX+zWoV59aNWrQuE3zbE+kAXRs34GEhHiatWvFv18Zes32evXoSdeOnWnWvjXd7+xJ9arVCA933Qds9549dOrZjaZtW9K0bUuOnzjJ2+Mn5hjnVTe8eWPC+G+6RDcf2KRh7hLojeDSpTjCwkJxOp38418vUr5sGcaOeDlX+x48cozpC36JeenlVyILNkohCgfTNP2VUokv//Pfvr6vgddcunSJsLAwnE4nTw95hnJly2G+OjLX+0+fO+vi/sMHHzMMYxZkuOGN1jrZ4bxhVtPIlUFPD+HgoSMkJiXRJLIhLw0ZnOt93e+V3NBc3EwcWmt1rfMhN5rHBz/JoSOHSUxMJCoyihf+9Vye9nc4HRpIuxPhVUk3OSX1z0NHj/dvHlkv67NghUjT9j2wO66+o2LLpo35z7s5X5yc3g/f5ny2MTtHjp102B2ODfmuQIgixjAMx/jx4/8+fvJExfJlC/dtR1p3apcpRzRv2owPJ72Xp3qmT8n/VUtOp5MTp075A9svv5bxeo2Zuw8cHrchZrtuWLem8vfzy3djBW3DioU+a9vhcLD7wGHWbNySnJpq/8xngQjhA06H450FSxaO7tO9Z0iZW0oX2hHvml9X+rT9S3GXWLn2t2TQMemX9cm0coRpmg0DA/w/SU5JbW6xKA2F4+YW2um0qhxuSpybMh6IRDmd2hrg77c9KTnlecMwfinY9oQoXEzTVDabbahFqeftdvstBfGZy+1nOWO5y8+9kwuuEZfWSillt1qtc1JSUv4v/eoR2a6R5l6Oo9BMM0yaNGnYCy+8MP56y3hIoixOKQSYphkEePyQOLef5YzlLj/3Yi7IjhOIz2o5ryKzMGVuFporzIvRCSFyL78LU15+XphzwU1z2YcQQhQGRSLp2izqbotr1VEhhMiSzWL9AgjMsaCPFYmk69B87oS7fR2HEKLwcmjn40Chv+l1kUi6FJ04hRC+VehzRaEP0M2ecxEhhCj8Oa3QB+gmSVcIkRuFPqcV+gDdJOkKIXKj0Oe0Qh+gmyRdIURuFPqcVugDdPPZ1/mEEEVKoc9phT5AIYTIgxO+DiAnRSXpZr16nRBCXO1nXweQE0m6QogbSaE//yNJVwhxI5Gk6yGSdIUQuSFJ10Mk6QohcqPQX+lUJJKZgm1KUcrXcQghCr0kXweQkyKRdDX00xqrr+MQQhRqTYGtvg4iJ0Uj6Wp93NcxCCEKN631Rl/HkBtFZU5XCCFuCJJ0hRDCiyTpCiGEFxWqOd3wYLXIz0pQfvcvFsytEWGqU8bX7Q7OXEzkn1rrY1ntZwsMedwaEPhwftsVQlyhHY7Y1PjYIVrr/VltV362ewn0/9c1K/GzKqXUGK11pkvAlFLFwsOs39tsluDAQEuKUqoFgWETQam0Qlb/uOvtR0EpVEk3wI/otx4l+DqrqZzxhY17sX+xhHZKqdu11hsybrf6+fe8tWO/tmUaRl9n00KICwe3O3dMf6+dUqq31nplpgJWS2ca12hLZPWsK9AaflyfyrGzPyul7tJaZ0ygZSwW1W6CWTPo2aE7U4DG3FK1JY37+APgtMPCd+xKqVpa692e7d31K1RJN8gfe7dIz9fbLRJbwypEPPs5K2xW9Zjdob/PWKZEtQaUb9Hd840LcZMp36K7pWTNqPA14x/72ernP8SRmvJ5pkIVSpFt0gVoUNWP/y2KZtPejUqpTlrrv9NvDgy02O+7uyzG63s5fjKlAuGlHVRvfqXA+WNW1s/8FOjoqX55yk0zp9uzKcwZTnCxEP4XHKDGKpXuUEQI4VFlG3egy9uLggKKlXrXFhTygVIqb9fZ26zwjx6B3NGiGv62GKVUVFbFunUuZVWKupk2NOmr8A9qppTqkb8eFJybJukCNLgVlo4mqGoZngsNZI5SKt/zx0KIawuvVJNuHy4LLl6l7mO2oNBflFLheapAKejVwsY/ekTgb1utLJbeGYt07RRhLRZui8y0r9UPugwOwS/wM6WUX747UQBuqqQLUKY4LBhBSIf6dA0NZKNSqryvYxJXHP1tAQufbsPiZzuzffq7TO9VhtTEeM7u2siyYXexZEhXlgzpyrE/lgAQf/Iwcwbcxtavx7H42c4sfKo1p7etS6vv+B+/sHRoL5YM6crSF+7g7M5MU/qiAAWElaDjhDnBldr2bWULDIlRSlXLcyVNa8FL9wUT5D9N+dteSL8pulVx4hMclXBmccuFqk2hTI0SWP2G5LsDBeCmS7oAgX7w6WCCBvegZpA/W5z21OK+jklA0oXTbPjwRaJHTKHb+0ux+gcCkBofy8aPXqLl0P/Q9b0lRBvfsPHDoaTExQKQcvEcEXWa0u39pdTt/2+2TB4DQNzxg2yf9jbtzO/o+t4Smj77Nr+/8aTP+nezstj8aPqvSQENHh5e2RoQ9Cdal81zJdXKwahBwRQPHY2f9fXLLxcv5keNasF2Ei5kzmVKQef/C0GpUdfVAQ8rVCfStCbEW20pBc/1wVajHBGDP43vGH/qqLeaFtk4u2sjJao3IKyCazBUtesAYr4wOL9vC/EnD7PKeOBKYaWIO36AgPCS2IJCKN+8GwARdZoS8+UoAE78uYy4EwdZ9nLftN2cDjtJ508RWKK01/olQClFzd7/sISVr15s5esP9c15jyyUCgfjoWDGfXe3tl9KOyfTvXNE4O6Z5zJfWwYQUQnqdLjeK6I8qlAlXW+7lAiTf8Xp72dxBhSLuKnfi0JB66sutUz/erEqden0xtxMm+JPHsbiF5D2XFksON2HmlpryjbuRIsXPiywkEXu2ZMT2fvTZCfK4gDyN8+65284HQslLBpQAEeOJmmHpVjW5bUTju905jPkAlGopheUIt5bbR06BV0NEmIO8nWqNWyBLUDOqflaRO0mnN+7hUvHDgBw8JdpAJSo3pC4Y/s5tWV1Wtlzuzehtb5mfWWjOnDiz1+JPbTzqv2E9yWePcEvz3dPOLVl9VxwfpXnCrSGJX86+WT+BVLt/S/nCq01y1adSyW0VHKW+21fBpfO7Lq+6D3rphzdrd0FD79HYkoqw5PtvO8fqmb5OiYBgSVK0+SZiaw2B+IfXpLyzbthsfkRFFGO6BFfE/Pf0Wz6bAROeyqhZW8leuSUa9YXVqEaLV74mD/efx5HchJOeyql6jajZK0srz4SBeTc3hhWjuif6EhJnOhIThxNgN8nearA4YRvlybz+47jpNg7Av6XN+3dn0BSktOOf3DmQ6SUBFj+ZSIpCY9ddyc86KZLulNXokd8S3xyKvc4nHoxuM6wisKhbONOVIruA8CBJd9RslYUymKhZK0oOk74IVP5kDKVuXPqjmyfl23cgbKNOxR43CJrR1bPY/07zyY4UpIe1E7nDwAq0D+n3a5ITIYP5iZw8OQmklN7aq1jlVK1Lm9evuo8yqJ2oFTma3XXTk9FOxdorddl2uZDN03SdTjBnEbK1JWcSUyhs9Z6Z857CW/bM/9zjq6ej9Nhxz+sBE3/NcnXIYl80FqzbeqbqbtmfxLrSE7sprXO+7zO2Yvw5owELsZPJyn1Ca11pvXPFi4+cyk+3hEDGb4gcf4YbF6Qij3lufz2oaDcFEn3UiI8/gEJmw+wJSGZnlrrc76OSWSt7v3PU/f+530dhrgO9uRE1k16JvHkphX7HMkJXbXWJ/JcyYET8PasRFJSDVIdk3QWE/h2u5M16y8EAJkHUMs+S0DrCdnd5MqXVE4nI7wpwE+lNK1Bgqfr3XscW3wyM+KTeFJrnZpxu39osemBJUr3CipZJsXTbQtxs4k7ftCWEhe72J4YN0BrnWnNMhXg9xHFQh8jIizrk18aOHDCRqp9gHY652XaX6kafn5qe2SDsKSdu+NPX4pzTCS4+LtEVHLV53QoTu1PIDWpalbt+1qhSrpKqdvJ76Uk1xYLrMrqr6W73SpAgwJoV4ibUTyw7BqftwpA4xzq2KG13pvN/hagK64TanuAE0DbDMW2aK0P5SlqLylUSVcIIW50heo6XSGEuNEVqqRrmqZPvxjvy/albWn7Zmnf1333tUKVdAFf/zJ82b60LW3fLO37uu8+VdiSrhBC3NAk6QohhBcVtqT72U3cvrQtbd8s7fu67z4ll4wJIYQXFbaRrhBC3NAKxb0XTNP8L9ALOGUYRn1vtWGa5ijgCeC0u9hwwzB+Mk0zApgJNAMmG4bxz+tsOxBYCQTges9nGoZhmKZZEvgeqAIcBO4zDOO8p9t3x2AFNgB/G4bRy4t9Lw58AdTH9QXPx4BdeKHfpmnWdrdzWTVgJFAc7/R9iLsdBXxuGMa7BfU7z+b/d3ZtVQF24Po9AKw1DONp9z6vA4OAEoZhhF5H228CvYEUYB/wqGEYF9zbhgGPAw7gWcMwFuW37aKosIx0JwO3+6iNdwzDiHQ/fnK/lgSMAF70UNvJQCfDMBoBkcDtpmm2BF4BlhqGURNY6n5eEO0DDMH1QUvPG31/D/jZMIw6QCN3DF7pt2EYuy73D2gCJACX7w9ZoH03TbM+roTbHFe/e5mmWZOC6/tkMv//zq4tgH3p+v90utfnu2O+3raXAPUNw2gI7AaGAZimWRfoD9Rz7/Oxe0CQ37aLnEKRdA3DWAkU6J2/8tKGYRjxhmGsxvVB8ETb2jCMOPdTP/dDA32By3fR/wq4syDaN02zItAT14gzp1g91rZpmuFAO+BLd90p7tGOV/qdQWdciSbb7+N7uP3bcI0gEwzDsAMrgLsooL5n8/87y7ZyqGetYRjHr7dtwzAWu/sNsBaomC6maYZhJBuGcQDYizvR5qftoqhQTC/42D9N0xyE69D7BcMwzhdEI+6/5huBGsBHhmGsM02zzOX/ZIZhHDdNs6BWS3wXeAkIy/B6Qfe9Gq5D+P+ZptkIV/+HAN7qd3r9ge/SPS/ovv8FvO6eNkgE7nC35c2+X6utqqZpbgIuAq8ZhrGqAON4jCvTPBVwJeHLjrpfu2kUipGuD30CVMd1yH8cKLA7ZhuG4XAf5lYEmrsPPwucaZqX59o2Ztjkjb7bcN1N6hPDMKJw3X3qlWvv4nmmafoDfYAZ7pcKvO+GYewA3sB1mP0zEANkugm3jxwHKrt/J/8GprqPSjzONM1XcfX7W/dLWaw8yk11CdVNnXQNwzjpToZO4HO8MJ/kPrxejms+66RpmuUA3P+eKoAm2wB9TNM8CEwDOpmm+Y2X+n4UOGoYxuXlUmbiSsLe6Hd6PYA/DcM4Cd77vRuG8aVhGI0Nw2iH6/B7D97te5ZtuQ/tz7p/3ojrRFetbGvJJ9M0H8Z1gm2gYRiXE+tRoFK6YhWBQnej8YJ0Uyfdy/8h3e7CdUhYEO3c4j6Lj2maQUAXXHe7nwc87C72MJB5jfHrZBjGMMMwKhqGUQXXIfavhmE86I2+G4ZxAjjivooAXPOq2/FCvzN4gHRTC178vZd2/1sZuNsdgzf7nmVb7v+PVvfP1YCawH5PNmya5u3Ay0AfwzDSL0wwD+hvmmaAaZpV3W2v92TbhV2h+HKEaZrfAR2AUsBJwDAM48uCbsP9PBLX4c1B4KnLc2DukWE4rhslXwC6GYaxPZ9tN8R1IsOK6w/ddMMwRrvn+6YDlYHDwL2GYZzzdPvp4ugAvOi+ZGwK3ul7JK4TeP64PtiP4n4P8EK/TdMMBo4A1QzDiHW/5q2+rwIigFTg34ZhLC2o33k2/7/nZNWWaZr3AKNxHfY7cH3e5rvrmQgMAMrjGoF+YRjGqHy0PQzXJZJn3cXSX5b2Kq55XjvwnGEYC/PbdlFUKJKuEELcLG7q6QUhhPA2SbpCCOFFknSFEMKLJOkKIYQXSdIVQggvkqQrhBBeJElXCCG8SJKuEEJ40f8D4oAR1vLXRyIAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "targetfile = \"../notebooks/input_files/recA_amplicon.gb\"\n", "feature_parse_specs_file = \"../notebooks/input_files/recA_feature_parse_specs.yaml\"\n", "\n", "with open(feature_parse_specs_file) as f:\n", " feature_parse_specs = yaml.safe_load(f)\n", "# we can't get accuracies for the dummy alignments used here\n", "feature_parse_specs[\"RecA_PacBio_amplicon\"][\"gene\"][\"return\"] = \"mutations\"\n", "feature_parse_specs[\"RecA_PacBio_amplicon\"][\"barcode\"][\"return\"] = \"sequence\"\n", "\n", "targets = alignparse.targets.Targets(\n", " seqsfile=targetfile,\n", " feature_parse_specs=feature_parse_specs,\n", " allow_extra_features=True,\n", ")\n", "\n", "print(f\"There is just one target: {targets.target_names}\")\n", "target = targets.targets[0]\n", "\n", "print(f\"It has the following features: {target.feature_names}\")\n", "\n", "_ = targets.plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Location of features in 0-indexed scheme:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.748717Z", "iopub.status.busy": "2022-03-04T14:07:42.748498Z", "iopub.status.idle": "2022-03-04T14:07:42.755838Z", "shell.execute_reply": "2022-03-04T14:07:42.755217Z", "shell.execute_reply.started": "2022-03-04T14:07:42.748691Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "termini5 0 147\n", "gene 147 1206\n", "spacer 1206 1285\n", "barcode 1285 1303\n", "termini3 1303 1342\n", "variant_tag5 32 33\n", "variant_tag3 1310 1311\n" ] } ], "source": [ "for feature in target.features:\n", " print(feature.name, feature.start, feature.end)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Make some queries\n", "We make some query sequences that we will align against the target.\n", "The name of each query is a description of all ways that it differs from target:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.757362Z", "iopub.status.busy": "2022-03-04T14:07:42.756902Z", "iopub.status.idle": "2022-03-04T14:07:42.777081Z", "shell.execute_reply": "2022-03-04T14:07:42.776358Z", "shell.execute_reply.started": "2022-03-04T14:07:42.757338Z" } }, "outputs": [], "source": [ "random.seed(1) # for reproducible output\n", "nts = \"ACGT\" # valid nucleotides\n", "\n", "\n", "def randseq(length):\n", " \"\"\"Random nucleotide sequence of given length.\"\"\"\n", " return \"\".join(random.choices(nts, k=length))\n", "\n", "\n", "def mutseq(wtseq):\n", " \"\"\"Random mutant sequence from a wildtype sequence.\"\"\"\n", " return \"\".join(random.choice([nt for nt in nts if nt != wt]) for wt in wtseq)\n", "\n", "\n", "def get_wt_query(\n", " target, ambiguous_features=(\"barcode\", \"variant_tag5\", \"variant_tag3\")\n", "):\n", " \"\"\"`(description, seq)` for wildtype query, ambiguous features assigned.\"\"\"\n", " seq = target.seq\n", " description = []\n", " for fname in ambiguous_features:\n", " f = target.get_feature(fname)\n", " fseq = randseq(f.length)\n", " seq = seq[: f.start] + fseq + seq[f.end :]\n", " description.append(f\"{f.name}={fseq}\")\n", " assert len(seq) == len(target.seq)\n", " assert re.fullmatch(f\"[{nts}]+\", seq)\n", " return \",\".join(description), seq\n", "\n", "\n", "queries = []\n", "\n", "# get two random (unmappable) queries\n", "queries.append((\"unmapped_1\", randseq(target.length)))\n", "queries.append((\"unmapped_2\", randseq(target.length // 2)))\n", "\n", "# get a fully wildtype query\n", "queries.append(get_wt_query(target))\n", "\n", "# query with query clipping at both ends and a codon substitution in gene\n", "desc, seq = get_wt_query(target)\n", "# add substitution to gene\n", "gene = target.get_feature(\"gene\")\n", "geneseq = gene.seq\n", "mutstart = 6\n", "mutlength = 3\n", "mutcodon = mutseq(gene.seq[mutstart : mutstart + mutlength])\n", "sub_desc = []\n", "for i in range(mutlength):\n", " wt = geneseq[i + mutstart]\n", " mut = mutcodon[i]\n", " sub_desc.append(f\"{wt}{mutstart + i}{mut}\")\n", " geneseq = geneseq[: mutstart + i] + mut + geneseq[mutstart + i + 1 :]\n", "seq = seq[: gene.start] + geneseq + seq[gene.end :]\n", "desc += f\",gene_{'-'.join(sub_desc)}\"\n", "# add clipping\n", "desc += \",query_clip5=9\"\n", "seq = randseq(9) + seq\n", "desc += \",query_clip3=7\"\n", "seq += randseq(7)\n", "# add to list of queries\n", "queries.append((desc, seq))\n", "\n", "# query with a deletion in gene and in insertion in spacer\n", "desc, seq = get_wt_query(target)\n", "delstart = 7\n", "dellength = 15\n", "geneseq = gene.seq[:delstart] + gene.seq[delstart + dellength :]\n", "spacer = target.get_feature(\"spacer\")\n", "insstart = 4\n", "ins = \"G\"\n", "spacerseq = spacer.seq[:insstart] + ins + spacer.seq[insstart:]\n", "seq = (\n", " seq[: gene.start]\n", " + geneseq\n", " + seq[gene.end : spacer.start]\n", " + spacerseq\n", " + seq[spacer.end :]\n", ")\n", "desc += f\",gene_del{delstart}to{delstart + dellength}\"\n", "desc += f\",spacer_ins{insstart}{ins}\"\n", "queries.append((desc, seq))\n", "\n", "# query with deletion spanning gene and spacer\n", "desc, seq = get_wt_query(target)\n", "delstartgene = gene.length - 7\n", "delendgene = delstartgene + 7\n", "geneseq = gene.seq[:delstartgene]\n", "delendspacer = 9\n", "spacerseq = spacer.seq[delendspacer:]\n", "seq = seq[: gene.start] + geneseq + spacerseq + seq[spacer.end :]\n", "desc += f\",gene_del{delstartgene}to{delendgene},spacer_del1to{delendspacer}\"\n", "queries.append((desc, seq))\n", "\n", "# query with insertion at boundary of gene and spacer, note how\n", "# it is assigned as being at end of gene\n", "desc, seq = get_wt_query(target)\n", "ins = randseq(14)\n", "seq = seq[: gene.end] + ins + seq[spacer.start :]\n", "desc += f\",gene_ins{gene.length}{ins}\"\n", "queries.append((desc, seq))\n", "\n", "# query with target clipping at both ends\n", "desc, seq = get_wt_query(target)\n", "target_clip5 = target.get_feature(\"termini5\").length + 4\n", "target_clip3 = 2\n", "seq = seq[target_clip5:-target_clip3]\n", "desc = desc.split(\",\")\n", "desc = \",\".join([desc[0], \"variant_tag5=None\", desc[2]])\n", "desc += f\",target_clip5={target_clip5},target_clip3={target_clip3}\"\n", "desc += \",termini5=None,gene=,termini3=\"\n", "queries.append((desc, seq))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Align the queries to the target" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.778034Z", "iopub.status.busy": "2022-03-04T14:07:42.777875Z", "iopub.status.idle": "2022-03-04T14:07:42.864942Z", "shell.execute_reply": "2022-03-04T14:07:42.863625Z", "shell.execute_reply.started": "2022-03-04T14:07:42.778016Z" } }, "outputs": [], "source": [ "# NBVAL_IGNORE_OUTPUT\n", "\n", "mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_CODON_DMS)\n", "\n", "with contextlib.ExitStack() as stack:\n", "\n", " # write queries to FASTA file\n", " queryfile = tempfile.NamedTemporaryFile(\"r+\", suffix=\".fasta\")\n", " queryfile.write(\"\\n\".join(f\">{desc}\\n{seq}\" for desc, seq in queries))\n", " queryfile.flush()\n", "\n", " # align the queries to the target\n", " alignmentfile = tempfile.NamedTemporaryFile(\"r+\", suffix=\".sam\")\n", " targets.align(queryfile.name, alignmentfile.name, mapper)\n", "\n", " # get the alignment cs data frame\n", " alignments_cs = targets._parse_alignment_cs(alignmentfile.name)\n", "\n", " # get the alignment results\n", " readstats, aligned, filtered = targets.parse_alignment(alignmentfile.name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check results of `Targets._parse_alignment_cs`\n", "We now check if the feature-specific `cs` strings returned by `Targets._parse_alignment_cs` are correct.\n", "\n", "We expect there to be alignments for just our one target plus the number of unmapped reads:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.867479Z", "iopub.status.busy": "2022-03-04T14:07:42.866951Z", "iopub.status.idle": "2022-03-04T14:07:42.873080Z", "shell.execute_reply": "2022-03-04T14:07:42.872473Z", "shell.execute_reply.started": "2022-03-04T14:07:42.867426Z" } }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(alignments_cs.keys()) == [target.name, \"unmapped\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure the number of unmapped queries equals the number we passed in:" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.874284Z", "iopub.status.busy": "2022-03-04T14:07:42.873999Z", "iopub.status.idle": "2022-03-04T14:07:42.879065Z", "shell.execute_reply": "2022-03-04T14:07:42.878525Z", "shell.execute_reply.started": "2022-03-04T14:07:42.874260Z" } }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unmapped_queries = [(desc, query) for desc, query in queries if \"unmapped\" in desc]\n", "len(unmapped_queries) == alignments_cs[\"unmapped\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure we get the expected queries mapped to our target:" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.880222Z", "iopub.status.busy": "2022-03-04T14:07:42.879950Z", "iopub.status.idle": "2022-03-04T14:07:42.885409Z", "shell.execute_reply": "2022-03-04T14:07:42.884588Z", "shell.execute_reply.started": "2022-03-04T14:07:42.880199Z" } }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mapped_queries = [tup for tup in queries if tup not in unmapped_queries]\n", "len(mapped_queries) == len(alignments_cs[target.name])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we look manually at whether we got the expected `cs` strings for features:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.887478Z", "iopub.status.busy": "2022-03-04T14:07:42.886821Z", "iopub.status.idle": "2022-03-04T14:07:42.904610Z", "shell.execute_reply": "2022-03-04T14:07:42.903673Z", "shell.execute_reply.started": "2022-03-04T14:07:42.887430Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_namequery_clip5query_clip3termini5_cstermini5_clip5termini5_clip3gene_csgene_clip5gene_clip3spacer_csspacer_clip5spacer_clip3barcode_csbarcode_clip5barcode_clip3termini3_cstermini3_clip5termini3_clip3variant_tag5_csvariant_tag5_clip5variant_tag5_clip3variant_tag3_csvariant_tag3_clip5variant_tag3_clip3
0barcode=CTAACTCGAGCTGCAATG,variant_tag5=A,variant_tag3=T00:32*na:11400:105900:7900*nc*nt*na*na*nc*nt*nc*ng*na*ng*nc*nt*ng*nc*na*na*nt*ng00:7*nt:3100*na00*nt00
1barcode=ACTAAAACCCACCGCGCT,variant_tag5=A,variant_tag3=C,gene_A6G-...97:32*na:11400:6*ag*ta*cg:105000:7900*na*nc*nt*na*na*na*na*nc*nc*nc*na*nc*nc*ng*nc*ng*nc*nt00:7*nc:3100*na00*nc00
2barcode=GTTACGACACGTTATTTC,variant_tag5=T,variant_tag3=C,gene_del7...00:32*nt:11400:7-tcgacgaaaacaaac:103700:4+g:7500*ng*nt*nt*na*nc*ng*na*nc*na*nc*ng*nt*nt*na*nt*nt*nt*nc00:7*nc:3100*nt00*nc00
3barcode=TAGTCTGTATCATAACTT,variant_tag5=A,variant_tag3=T,gene_del1...00:32*na:11400:1052-agatttt00-taatcgtct:7000*nt*na*ng*nt*nc*nt*ng*nt*na*nt*nc*na*nt*na*na*nc*nt*nt00:7*nt:3100*na00*nt00
4barcode=TACCCTTGCTAAGTACGC,variant_tag5=T,variant_tag3=G,gene_ins1...00:32*nt:11400:1059+gggataggagacta00:7900*nt*na*nc*nc*nc*nt*nt*ng*nc*nt*na*na*ng*nt*na*nc*ng*nc00:7*ng:3100*nt00*ng00
5barcode=GCGTCACAAGTGCCGAAT,variant_tag5=None,variant_tag3=T,target...001470:105540:7900*ng*nc*ng*nt*nc*na*nc*na*na*ng*nt*ng*nc*nc*ng*na*na*nt00:7*nt:290210*nt00
\n", "
" ], "text/plain": [ " query_name \\\n", "0 barcode=CTAACTCGAGCTGCAATG,variant_tag5=A,variant_tag3=T \n", "1 barcode=ACTAAAACCCACCGCGCT,variant_tag5=A,variant_tag3=C,gene_A6G-... \n", "2 barcode=GTTACGACACGTTATTTC,variant_tag5=T,variant_tag3=C,gene_del7... \n", "3 barcode=TAGTCTGTATCATAACTT,variant_tag5=A,variant_tag3=T,gene_del1... \n", "4 barcode=TACCCTTGCTAAGTACGC,variant_tag5=T,variant_tag3=G,gene_ins1... \n", "5 barcode=GCGTCACAAGTGCCGAAT,variant_tag5=None,variant_tag3=T,target... \n", "\n", " query_clip5 query_clip3 termini5_cs termini5_clip5 termini5_clip3 \\\n", "0 0 0 :32*na:114 0 0 \n", "1 9 7 :32*na:114 0 0 \n", "2 0 0 :32*nt:114 0 0 \n", "3 0 0 :32*na:114 0 0 \n", "4 0 0 :32*nt:114 0 0 \n", "5 0 0 147 0 \n", "\n", " gene_cs gene_clip5 gene_clip3 spacer_cs \\\n", "0 :1059 0 0 :79 \n", "1 :6*ag*ta*cg:1050 0 0 :79 \n", "2 :7-tcgacgaaaacaaac:1037 0 0 :4+g:75 \n", "3 :1052-agatttt 0 0 -taatcgtct:70 \n", "4 :1059+gggataggagacta 0 0 :79 \n", "5 :1055 4 0 :79 \n", "\n", " spacer_clip5 spacer_clip3 \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "5 0 0 \n", "\n", " barcode_cs barcode_clip5 \\\n", "0 *nc*nt*na*na*nc*nt*nc*ng*na*ng*nc*nt*ng*nc*na*na*nt*ng 0 \n", "1 *na*nc*nt*na*na*na*na*nc*nc*nc*na*nc*nc*ng*nc*ng*nc*nt 0 \n", "2 *ng*nt*nt*na*nc*ng*na*nc*na*nc*ng*nt*nt*na*nt*nt*nt*nc 0 \n", "3 *nt*na*ng*nt*nc*nt*ng*nt*na*nt*nc*na*nt*na*na*nc*nt*nt 0 \n", "4 *nt*na*nc*nc*nc*nt*nt*ng*nc*nt*na*na*ng*nt*na*nc*ng*nc 0 \n", "5 *ng*nc*ng*nt*nc*na*nc*na*na*ng*nt*ng*nc*nc*ng*na*na*nt 0 \n", "\n", " barcode_clip3 termini3_cs termini3_clip5 termini3_clip3 variant_tag5_cs \\\n", "0 0 :7*nt:31 0 0 *na \n", "1 0 :7*nc:31 0 0 *na \n", "2 0 :7*nc:31 0 0 *nt \n", "3 0 :7*nt:31 0 0 *na \n", "4 0 :7*ng:31 0 0 *nt \n", "5 0 :7*nt:29 0 2 \n", "\n", " variant_tag5_clip5 variant_tag5_clip3 variant_tag3_cs variant_tag3_clip5 \\\n", "0 0 0 *nt 0 \n", "1 0 0 *nc 0 \n", "2 0 0 *nc 0 \n", "3 0 0 *nt 0 \n", "4 0 0 *ng 0 \n", "5 1 0 *nt 0 \n", "\n", " variant_tag3_clip3 \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "5 0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with pd.option_context(\"display.max_colwidth\", 70, \"display.max_columns\", None):\n", " display(alignments_cs[target.name])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check results of `Targets.parse_alignment`\n", "Now check the results or `Targets.parse_alignment`.\n", "\n", "First make sure the read stats are correct.\n", "Here are those read stats:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.906546Z", "iopub.status.busy": "2022-03-04T14:07:42.906130Z", "iopub.status.idle": "2022-03-04T14:07:42.914040Z", "shell.execute_reply": "2022-03-04T14:07:42.913463Z", "shell.execute_reply.started": "2022-03-04T14:07:42.906500Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
categorycount
0aligned RecA_PacBio_amplicon3
1filtered RecA_PacBio_amplicon3
2unmapped2
\n", "
" ], "text/plain": [ " category count\n", "0 aligned RecA_PacBio_amplicon 3\n", "1 filtered RecA_PacBio_amplicon 3\n", "2 unmapped 2" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "readstats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure they are correct for number of unmapped reads:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.914953Z", "iopub.status.busy": "2022-03-04T14:07:42.914798Z", "iopub.status.idle": "2022-03-04T14:07:42.920141Z", "shell.execute_reply": "2022-03-04T14:07:42.919584Z", "shell.execute_reply.started": "2022-03-04T14:07:42.914934Z" } }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "readstats.set_index(\"category\")[\"count\"].to_dict()[\"unmapped\"] == len(unmapped_queries)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now look at the filtered reads and make sure that they are what we expect:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.921183Z", "iopub.status.busy": "2022-03-04T14:07:42.920959Z", "iopub.status.idle": "2022-03-04T14:07:42.929637Z", "shell.execute_reply": "2022-03-04T14:07:42.928962Z", "shell.execute_reply.started": "2022-03-04T14:07:42.921163Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_namefilter_reason
0barcode=ACTAAAACCCACCGCGCT,variant_tag5=A,variant_tag3=C,gene_A6G-...query_clip5
1barcode=TAGTCTGTATCATAACTT,variant_tag5=A,variant_tag3=T,gene_del1...spacer mutation_nt_count
2barcode=GCGTCACAAGTGCCGAAT,variant_tag5=None,variant_tag3=T,target...termini5 clip5
\n", "
" ], "text/plain": [ " query_name \\\n", "0 barcode=ACTAAAACCCACCGCGCT,variant_tag5=A,variant_tag3=C,gene_A6G-... \n", "1 barcode=TAGTCTGTATCATAACTT,variant_tag5=A,variant_tag3=T,gene_del1... \n", "2 barcode=GCGTCACAAGTGCCGAAT,variant_tag5=None,variant_tag3=T,target... \n", "\n", " filter_reason \n", "0 query_clip5 \n", "1 spacer mutation_nt_count \n", "2 termini5 clip5 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with pd.option_context(\"display.max_colwidth\", 70, \"display.max_columns\", None):\n", " display(filtered[target.name])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check that the aligned reads are also what we expect:" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "execution": { "iopub.execute_input": "2022-03-04T14:07:42.930815Z", "iopub.status.busy": "2022-03-04T14:07:42.930544Z", "iopub.status.idle": "2022-03-04T14:07:42.939882Z", "shell.execute_reply": "2022-03-04T14:07:42.939309Z", "shell.execute_reply.started": "2022-03-04T14:07:42.930792Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_namequery_clip5query_clip3gene_mutationsbarcode_sequencevariant_tag5_sequencevariant_tag3_sequence
0barcode=CTAACTCGAGCTGCAATG,variant_tag5=A,variant_tag3=T00CTAACTCGAGCTGCAATGAT
1barcode=GTTACGACACGTTATTTC,variant_tag5=T,variant_tag3=C,gene_del7...00del8to22GTTACGACACGTTATTTCTC
2barcode=TACCCTTGCTAAGTACGC,variant_tag5=T,variant_tag3=G,gene_ins1...00ins1060GGGATAGGAGACTATACCCTTGCTAAGTACGCTG
\n", "
" ], "text/plain": [ " query_name \\\n", "0 barcode=CTAACTCGAGCTGCAATG,variant_tag5=A,variant_tag3=T \n", "1 barcode=GTTACGACACGTTATTTC,variant_tag5=T,variant_tag3=C,gene_del7... \n", "2 barcode=TACCCTTGCTAAGTACGC,variant_tag5=T,variant_tag3=G,gene_ins1... \n", "\n", " query_clip5 query_clip3 gene_mutations barcode_sequence \\\n", "0 0 0 CTAACTCGAGCTGCAATG \n", "1 0 0 del8to22 GTTACGACACGTTATTTC \n", "2 0 0 ins1060GGGATAGGAGACTA TACCCTTGCTAAGTACGC \n", "\n", " variant_tag5_sequence variant_tag3_sequence \n", "0 A T \n", "1 T C \n", "2 T G " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with pd.option_context(\"display.max_colwidth\", 70, \"display.max_columns\", None):\n", " display(aligned[target.name])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "toc": { "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }