{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test alignment parsing by `Targets`\n", "This Jupyter notebook is designed to test parsing of alignments by `Targets`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import Python modules" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import contextlib\n", "import os\n", "import random\n", "import re\n", "import tempfile\n", "\n", "from IPython.display import display\n", "\n", "import pandas as pd\n", "\n", "import yaml\n", "\n", "import alignparse.minimap2\n", "import alignparse.targets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set up `Targets`\n", "Read in the RecA target used in the notebook examples:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "There is just one target: ['RecA_PacBio_amplicon']\n", "It has the following features: ['termini5', 'gene', 'spacer', 'barcode', 'termini3', 'variant_tag5', 'variant_tag3']\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAADSCAYAAADpGRMOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dd3xTZRfA8d9J0l02yJQhoiCzlL1XARnuhYq4X/fALUqIAxRFcPs6UVEQBAeiyN57CsgQmb7KHoWuNMnz/pG0FmhpS7MK5/v55AO5ee5zzs04vXnuzX3EGINSSqngsIQ6AaWUOpdo0VVKqSDSoquUUkGkRVcppYJIi65SSgWRFl2llAoiLbqq2BCR6iJyXESsoc6lIERktIi85Pt/exHZHOqcVOhp0S1GRGSHiKT5Cs8e34c63k99jxYRl4hULmD7ISKS6cvliIgsEpHWRczhVhFx+/o8LiLbROTerMeNMbuMMfHGGHdR4oSCMWa+MebiUOehQk+LbvHT1xgTDzQBEoBnitqhiMQBVwNHgZsLseo3vlwqAAuASSIiRUxnsa+wxvtyGi4iCUXsU6mwoUW3mDLG7AF+xVt8EZEoEXldRHaJyF4R+UBEYrLai8jlIrJGRJJF5E8R6Zmju6uBI8ALwIAzyCUT+ByoBJQTkdoiMktEDorIARH5SkRK58jlfBGZJCL7fW3eyaPf1cBGoJ5vvZoiYkTE5rtfRUR+FJFDIrJVRO7KL1cRaSEii3175/+IyDsiEpnjcSMi94nIHyJyTERe9G3PIt9zNz6rvYh0EpG/RORZ33buEJGb8ojbSUT+yu85EBGLiDwnIjtFZJ+IfCEipU7a/gG+1/mAiAzKb5tVeNGiW0yJSDXgUmCrb9ErwEV4i/CFQFVgsK9tC+AL4AmgNNAB2JGjuwHAWGAcUFdEEguZSxRwK7DbGHMAEGAYUAVvwTwfGOJrawV+AnYCNX15jsuj3+a+bVqRR+hxwF++ONcAQ0WkSz7puoFHgfJAa6ArcN9JbXoAiUAr4EngQ7zfAM4HGgD9crSt5OurKt7n8UMROe0wQj7Pwa2+W2fgAiAeOPmPUjvgYl/ug0WkXj7brMKJMUZvxeSGt1AeB44BBpiJt4gKkALUztG2NbDd9///AiPz6LM64AGa+O7/CrxZgFyGAE68e8j7gFlAYh5trwBW58hrP2DLpd2tgMvXZ9Y2vg2I7/GavmU2vAXQDZTIsf4wYHQhn9NHgO9y3DdA2xz3VwJP5bg/Ahjl+38nX75xOR4fDzzv+/9o4KUcbf8qwHMwE7gvx/2LgUzfNmdtf7Ucjy8Dbgj1e1NvBb/pnm7xc4UxpgTeD3FdvHtZFYBYYKXva/MRYKpvOXgL1J959Ncf2GiMWeO7/xVwo4hEFCCX8caY0saY84wxXYwxKwFEpKKIjBOR/4lIMjDGl2dWLjuNMa48+lzi67ME3r3I+sDQXNpVAQ4ZY47lWLYT715jnkTkIhH5yXcgMtnXd/mTmu3N8f+0XO7nPHh52BiTclIOVU6XA6d/Dqr4+sjZnw2omGPZnhz/Tz0pHxXmtOgWU8aYuXj3pF4HDuAtBvV9Bau0MaaU8R6MAtgN1M6jq1uAC3xFaA/wBt4i1KsI6Q3Fu0fW0BhTEu9X86wDbLuB6lnjsqdjjNkLTAT65vLw30BZESmRY1l14H/5dPs+sAmo48vt2Ry5nYkyvgOROXP4O591Tvcc/A3UOKk/FycWflWMadEt3kYBSUBD4CNgpIicByAiVUWkh6/dJ8BtItLVd6CmqojU9Z3iVRtogXcsuAneMcuv8RbjM1UC7zDIURGpincsOcsy4B/gFRGJE5FoEWmbWyciUg64Ethw8mPGmN3AImCYr49GwB1496rzyy0ZOC4idYF782lfEA4RiRSR9kAfYEI+7U/3HIwFHhWRWuI9HXAo3rNE8vpmoIoZLbrFmDFmP94DZIOBp/AeVFvi+9o8A+94IMaYZcBtwEi8p4XNxbs3NQD4wRizzhizJ+sGvAn0EZGyZ5iaA2jqizUFmJQjZzfePdcLgV14D4Rdn2Pd1uI7TxfvmQv7gQfziNMP7zjn38B3gN0YMyOf3B4HbsQ7ZvwR8E1hNiwXe4DDvhy+Au4xxmw63Qr5PAefAl8C84DtQDp5b78qhrIOUCilCklEOgFjjDHVQp2LKj50T1cppYJIi67Kk4j8Iv/+JDfn7dlQ53Y6xTVvdW7Q4QWllAoi3dNVSqkg0qKrlFJBpEVXKaWCSIuuUkoFkRZdpZQKIi26SikVRFp0lVIqiLToKqVUEGnRVUqpINKiq5RSQaRFVymlgkiLrlJKBZEWXaWUCiItukopFURadJVSKoi06CqlVBDlOw22UkrlxeFwCHBVVFTUHYjUpGjT2QfK3oz09K+A0Xa7PTPUyejMEUqpM/by0KEvx8TEPNysbeu4MmXLIhJ+NfdYcjJrlq1IPXzo0IxMp/MKu90e0qKne7pKqTPicDjirVbrY1f3vykqLj4u1OnkqVLVKtSqc2Hs5+/9t1sm1AfWhzIfHdNVSp2pxNJly6aHc8HNYrPZqHFBLYB2oc5Fi65S6kzFREZGhjqHAouMirIBsaHOQ4uuUuqs5PF4yHnMKlzGm3VMVynlN6mpqTxw591s3riRiIgIatepw+3/uYtBjz9J/YYNWbt6NbGxcbz90QdcXK8ee/fs5T8DbuXYsWNkpKeT1LMn9qEvAeB0Onl58BBmTZ+O1WqlRs2afD5+HABvvf4GP33/Ay63i8pVqvDGu+9QsVJFhr/0Mpt+38ix5GT+2r2bX+bMonSZMqF8Sk6hRVcp5Tezp8/g2LFjLFy9EoAjhw+zYd06Nqxbz8sjXuPdTz5i3JivuP/Ou5mxcD6lSpdizMQJxMfHk5mZyXV9L2fmtOl07Z7Em6+9zs7t25m5eCGRkZEcPHAAgAljx7Fj+zamzpuNxWLhsw8/wv70M3ww+lMAVi1fwczFCyhXvnzInofT0aKrlPKb+o0a8semzTz5yKO0bd+epEt7AlCrdm3atm8PwHU39uOx+x/kWHIyFqsVx7ODWL5kKcYY9u3dy/q1v9G1exLTfp6K49WhZI0bZxXRqT9NYe2q1XRp3RYAt8tFiZKlsnPo1rN72BZc0DFdpZQf1axViwWrltOpSxfmzZ5NpxatSU/PyLP9B2+9zZHDR5g6bw5zly/l0r59yMhIP20MYwyPPv0kc5YuZs7SxcxfuZyfZ8/IfjwuLt5v2xMIWnSVUn7z91//w2K10uuyvrw4/FUOHjjAkUOH2bFtG4sXLARg4jfjqdegPiVKluTokaNUrFSJ6Oho/vnf30z9aUp2X9179eTDt9/F6XQCZA8v9OzTm88+/Igjhw8DkJGRwfrf1gV5S8+cDi8opfzm9w0beOn5wQC43W4efvwxKlWpxCUN6jNm9Oc8+fAjxMTE8u7HHwJw1333csdN/Wmf2JzKVavSvlOn7L4eevwxXnreTueWrYmIjKTWBRfw2divuO7Gfhw6cJDLu3uHLjweD7fdfRcNGjUM+vaeCf0ZsFLqjDgcjp6Vq1Ydd9XN/Uqdrt3CefOwPzOIGQvnByu1XM2fMcv528pVz9jt9jdCmYcOLyilVBBp0VVKnal0Z6Yz30ZtO3QI+V4uQEZGuhtIC3UeWnSVUmdq1eGDh6LTUlNDnUe+3G43u7Zt9wCLQp2Ljukqpc7Y0GFDR8TGxf+nRbs23ks7WsLjp7ZZjDEcTz7G6mXL0w7s2z8/0+nsqZd2VEoVW5nOzMePOg+vnjd95h1gqmP88+05NS2tdGxMzJGi9mMAEdmXkZ4+BvhvqAsu6J6uUioMicgQY8yQUOcRCDqmq5RSQaRFVymlgkiLrlJKBZEWXaWUCiI9kKaUCgiHw1ES6A3UopA7eOvWrevSsGHDWQFJrOAOAVPtdvs2f3aqRVcp5XcOh6OFzWadUa3aeVSpXC7WYrEUqugaY0REQlacjDEmOTklffOWXRaPx7w4aNBzQ/3VtxZdpZTfDRs2dEef3m1q1L+kVqhTKZLk5BTe/+/3aU5nZoLdbt/sjz51TFcp5VcOh6OmCOddUq9mqFMpspIl47jkkpoWoJe/+tSiq5Tyt/JxcTHOcJl9t6hKl4qPsljkPH/1p0VXKRUU1oiaHD+eEuo0AOjS9Xp+mjKzQG29fzz89xdEi65Sqthwu92hTqHItOgqpYLm9RH/pWnipdSr34WJk37JXn5z/4dp0bIvjZv04Kpr7ubw4aMAzJm7mCYJPbn9jsdpmngpv0ydw9Gjydxx5xM0btKDhKY9efAh7/RAx4+ncPsdj9OoSXcaNenOa69/kN3/77//Qes2l9OwcRL9bnrghMky//lnH9defy+tWl9O4yY9GPbKuwF9DvQqY0qpoLFaraxa+QubN/9Juw5X075dc847rzyjRtopX74sAM8Pfp3hr73PsKFPA7Dh9y28/97LtG6dCMDtdzxOfHwcq1f9gsVi4cCBQwC89PJbeDwe1q7+lWPHjtO2/VU0aHAxl/bszIBbH+WBB25lwC3XsGTJKtp3vCY7pwG3DeS5QQ/SoX1LnE4nSd1volmzRiR1ax+Q50CLrlIqaG6/7XoALr64NgkJDViydDWX9U3iyy8n8fXY73E6M0lJTaVOnX9PNatTp2Z2wQWY8vMsli+dTNapv1nFeubMhYx8w46IULJkCW64/jJmzlxI2zbNWL9hC/1vvgqAVq2a0rDBxQCkpKQyd+4SDuw/mN3/seMpbNq4VYuuUursNH/BMj74cAwL5k2kQoVyfD32Bz7++Ovsx+Pj4gIW2+PxICIsXfIjERERAYuTk47pKqWCZvTnEwD444/trFmzgVYtEzhyJJlSJUtQrlwZMjIyGD16/Gn76N2rC6+P+JCsH3ZlDS907dqWTz/7BmMMx44d55vxk+nWrR0lS5agYYOL+XrsDwAsW7aGdeu9v3MoUSKe9u2a8+rw97P73737b/bs2ef3bc+iRVcpFTQul4vEZr247Io7eP+9oZx3Xnl69ujIBbWrU/eSznTucj0JCQ1O28cbI57n2LHjNGrSnYSmPXnxpbcAeG7QQxhjaJzQg7btr+Lmm66kZ49OAIz+7A3efXc0jZp0Z9Rbn9C8WaPs/r78YhS/b/yDxk160LhJD/rd+ABHjiQH7DnQnwErpfzK4XA0K1u25Iz7772qVKhz8YcFC39j7rw1rz7//PNP+6M/3dNVSvmb2+PxnB0/RwPcbo/xeDz5zzVfQFp0lVL+tv3YsdSojIzMUOfhF7t37z0O+OViN6BFVynlZ3a7/YjNZp3205SF6SkpaaFO54y5XC5Wrtpsdv+1zw1M9le/OqarlPI7h8MRGxUV8bHL5b7SZrN6LIW8Nq7L7Y60Wa35fqUvaLvCMiCZma4Im822PiPDOcBut6/zV99adJVSAeNwOCKBChTyW/WHH3746N133z3SX+3O0BG73X7M351q0VVKhR0RGWKMGeKvduFEx3SVUiqItOgqpVQQadFVSqkgCsoFbxwOhwAXACWCES9AUoGtdrvdE+pElCrOHA5HKaA6YM2rTd26dSs5HI4m+fVV0HZ+ZoB/7Hb7GV2gIeAH0hwOR9+ICNvHVqs1Pjoq0kVx/J2KgQxnptWV6XK63O4HBg8e/HX+KymlcnI4HPGRUVHfuF2ubjFxcekWyfuLtsvlirLZbBl5NihkO38yGNJSUqIsFus6pzPjSrvd/ldh1g9o0XU4HHUjImwrb7i6Z2z1apUo7hPV/f3Pfr4a/3NqhtPZ2W63Lwt1PkoVJ8NeeeX7mhdc2KNDUo/oyMioUKdTJG63m9XLFrtWL1+6PdPpvNhutxe4kAZ0TNdqtdyU0Kiurcb5lYt9wQWoUrkCLZs3jIqIsN0a6lyUKk4cDkec2+Xq2b5r92JfcME7A0Ziq7Y2m81WBahfmHUDWnQjbLZLKp5XLjKQMYKtYoWyVpvVWqgnWSlF1ajo6Myo6OhQ5+E3IkLZ8hVcQO3CrBfQoisiVstJe7iOl0fgdPr9V3snmDxlGk8++2K+7VasWkv/2x7Ivm+Lq0pCi24ktkoisVUS69ZvPGUdi8UCkvcBAKVUriwillO+gg9/+cWA14OpUyYzZFD+V2Vcs2ol99w+APDOKHFplw50atWMTq2acf0Vfdi1c8cp61gsFuE0BwRzE/RTxl4c+gZOZ+GuPuRyuQrVvm/v7gwf+ny+7Zo1bcyXn71zwrL5s35g5ZLprFwynYYN6hUqrlKqcF4f9lKhi25h60HP3n0Z8vIr+bZr0jSRDz79HPDuXH3z3WTmLFnBnCUr6JrUg8FPP1mouHkJatF98NFnAWjf5XISWyWxa/f/uPu+x2nVoTcJLbrx6OODs+e179LzGgY+MZg2nfpwxbW3sWPnbipWb8Czg4fRrHV36id0YOXq3/jP/U+Q0KIbrTv2yZ5i4/Mvv+G6m+4CYM68RSS2SuLeB58koUU3mrbsxsZNf2Q/1rLdpcF8CpRSPk89+hAAvbt2pHPr5vy1exeP3n8PPTq2pWPLRAY9MTC7HlzRM4nnnnyMSzu3p/91V7Fr5w7qVq/CS/bn6NKmBW0SGrJ29SoGPnAvHVsm0rNTO/bu3QPAuDFfcPtNNwCwcN5cOrduzmMP3kfHlol0atWMLZs2Zj+W1L51dn4lS/17DfZjx45lT4RZVEEtum+PHAr8uzf50rCRdGjXiiXzprByyTT27T/AZ1+My26/bccu5s34np+++xKAgwcP07ZNc1YsnsZtt/Sje+/ruffuAaxeNoPEhIa8+9/Pco27YeMW7r7jFlYvm8E1V/dl6Ktv5plj157XkNgqiWcHDyMjI6hnoih1Tnl1pHeanSkz5zJ78XJGvDKU1u3a8+vchcxevJwD+/fz9Rejs9vv3LGdydNnM3bSjwAcOnSQlq3bMGvRMm4ccCtX9+nJbXffw9ylK2nUJIFPP3g/t7Bs3vg7t955N3OXruTyq67hjeF57wX3u+oy6l9QnR8mTmDo62/4ZbtD+ou0yVOmMWLUByS2SqJ5mx6sWvMbW/7Ylv14v+uuxGb79/cb8fFx9O7ZDYCEJg2oVqUyTRp751NqmtCIP7ftyDXOxXVqk9DE265l86b8uT33dts3L2Ppgl+YPW0SGzdt4aVXRvlhK5VSBfHrlJ94d9QbdG7dnK5tW7J2zSr+3PpH9uNXXXvDCfUgLj6epJ69AGjUOIEqVarSsFFjABonNGX7tj9zjXNhnYto2Nj7e4rE5i3YuX1bru0Axk76kXVbd3DltdfzxqvDiryNEOIp2A2Gid98wgW1auT6eFx87An3o6L+PRHCarUSHf3vqSdWiyXPsZ4T2lmtuF3uXNudX60qACVLluD2W/sx6u0PC7YhSqkiM8bw+bgJ1Kx1Qa6Px8WfOBV7VOSJn+ucZ0ZYrdY860FB22WxWCzcOOBWWjWuz/BRb+e7HfkJ+p5uiRLxHE32zrTZt1d3ho94N3vc5sCBQ2zfsSvYKQFw+PAR0tK8V7l3uVxM+m4KjRvqmWFKBVJ8iRIcSz4KQI/efXh7xGvZ9eDggQPs3LE9JHkd2L+fgwcOZN+fPGki9eqffpbiggr6nu6jD/2HpF7XERMdzfffjuaV196macskRISoqEhGDHdQq2b1YKfFpi1bue/BpxGBTJeL1i2b8cJg/xytVErl7t4HH+Gq3j2Ijo5hzIRJjHrtVTq3aoaIEBkVxYuvvk6NmrWCnte+vXt56J47yczMBGOoXrMm732c+zGjwgroz4CHv/rK9927tL68Yf06AYsRbH/8uYsff5mz8Mknn24X6lyUKi4cDkfduPgSywbc80BxvujVKSZ/Oy55947tt9nt9kkFXSegwwsG7zjN2cS3PWfXRikVeOZsqwVwZvUgoEXX5XLvOpp8/Ky6FOLR5GN43J7doc5DqWJmf0Z6WpS7kD9sCHfJR48A7C3MOgEuuq5vV6z+Pa04T8OcU3p6BktXrE/JcGaOy7+1UiqL3W4/ZLXZ1q9fs+qs2QnbtX0bKcePe4ClhVkv0AfS5judmW++/eG4gTWrV86Mi42xBfJqYy6XK8JmsxXuN8YFYIwhLT3DvX3n/2zAl8Bkf8dQ6mznzMi4bumCeQs2/LYmtmKlyjbfdQty5XK7I2xWa76f5bzaFXT9M+Exxhw+eMB1cP8+i9vt7mu323M/BzUPQZkN2OFwVAG6EOCZI5YtW9a7RYsWUwLUfQow12637wxQ/0qd9RwOhw3oAFzIaS4UU9DPcl7tAlwLDPAXMNNutxf6a/xZNQV7cZyOWSl1qqJOwR7OtUAnplRKqSDSoquUUkGkRVcppYIo1zFdh8NRAmgPlAx6RkUwd+7cazp27PhtEEIlAwvsdntyEGIpFXYcDofgnRusHoWcOaEgCvpZzqtdEGtBXjzALmCZ3W4/4TS5U4ruiy++eK+IjKxYoVxGXGyMUIzmTM90uaIiAj4dsyElNc3s3X8wCmOefO75598KbLyzl8PhSIyMjBpkMG2NMRHBjG2xWJLdbvdEt8v1kt1uPxzM2MWdw+EoHxkROctqtV5Q+byKbovV6vcikZmZGRUREZHvZzmvdgVdP1CMx2MOHDooKWmpqS6Xq7Pdbs+e++uEoutwOBpGRUYuvePma2PKlC6Va2fK68jRZD4eMz41I8PZwW63rwx1PsWNw+FoYrNFLGjRvlNstZq1JSIyiPOXGkNKynE2rF7u3PHHlq2Zmc4mdrs9IOd0no1eGfbK5AZ1L+nerX3nyLNhlu9AWvv7OjNz/uxdzszMWlnTtJ/w4wiLxXJN4wZ1bVpw81e6VEmaNqofuWzVb9cAWnQLKSIycmBCy7YxDRNbhuRTG1+yFOdVqhI5Ye+H5x8+uL8L8Gso8ihuHA5HhMViSWrXvLUW3AJoVK+BzFk0vzyZmRcDm+CkA2kRNlutsmVKB/VrXnFWtnQpW4TNVqjpl5WXiKVttRq1QnogV0SoUbtOHNAilHkUM6WtVquJiYkJdR7FgohQqmRJF1Ala9mJb3rBon+9Ck68E9XpGSBnxETZbKH/+x4REWERkej8WyofEeTs+UVVEIj3uFh2YQ1Kwfj7nz10631lkfrYsXMXH332Rb7t1vy2jgmTfihSrKx40WUqk9i2c/bt4MFDRe5XKXWqv//5h+6X9ypSHzt27eTjzz/Nt93adb/x7fcFvvxtnv7Zs4c2XTvQolMbEtu35Mbb+3P4SP7HZANedF0uF1UqV2LGlO+K1M/OXbv5ePSX+bZbu24D335X9KILULpUKVYunJ19K1eurF/6VXn76ftJtEuoT1KbZrz52jAqx9tIOX6cVcuXcvWlXenergXd27VgxlTvz+p379zBJdUrMmzIcyS1aUa7hEtYumhBdn8zf/2Zy7q1p3u7FvTp0paVy5aEatNUHrw1ojLTfvi5SP3s3LWLT3PMHpyXtet/Y+IPRS+65cuVY8bkqSybs4iV85dStXJVhr3+ar7r5Vt0hw5/g8eefj77/sGDh6hUsy4/T51Ou66X0qxdF5q06sg33/5bVLv2uoKBTz1H2y6XcuX1/dmxcxeVatbNfrz/HffQsmMSTVp15JobB3D48BEA5s5fSGLbztz78GMktO5I0zad2Lh5CwAPPfY0GzdtIbFtZ67vf3uuuR48eAjHy68yc848Ett25pEnnj1tPIDnXxhK3cYtaNO5J88MfoGWHZPyfdJUYOzfu5cnH7qHLyZ8z/RFK4j2jRsePXqEpx6+n/c+G8O0Bcv48tsfePKh+zh6xPs6Hj50kGYtWzF90Qoeffo5Xh78DAA7tv3JyFdf5qtJU5i2YBkj3vmQu2/pF7LtO1sNGzGcJwY9nX3/4KGDVL2oBr9Mm0rHnl1o2bktie1bMn7Sv6fNJl12KY8PeooOPTpz9c3Xs2PXTqpe9O8EtQP+cwdtunYgsX1LrrulX/Ye5NwF82nRqQ33D3yIZh1a0bxjazZt2QTAI08NZOOWTbTo1IZ+t92ca64HDx3khVdeZtbcObTo1IaBzzxx2ngAg192cEnzxrTv3plBjudp07UDABEREcTGeifPdbvdHE85jsWS/35svi1u7ncd4yd+lz1j5tgJk+jTqwetWzZn7rSfWLFgFr/+OIGnnhtyQjHbvmMnc6dNZvLEsaf0OXL4yyydO501S+ZySd26vJZjhs3fN27m7ttvZfXiuVx75eUMHe6da/6tEa9Qr+5FrFw4m2++zP0rRLlyZbEPeoqunTqwcuFsRr029LTxfvrlV6ZMncbKRbNZMPNntv554lTMyceO0bJjEi06dGPEm++cdbNghJtVK5bRsHECF1zond6pX//bAFi3ZjW7dm7npiv70K11Ijdd2QcRYfu2rYBvKu5L+wCQ2LwlO7Z5X8c5M6axY9s2ruzRmW6tE7n/jltwu1zs31uoa06rfNx0fT8mfPdtdo34ZuIEevfsRasWLZk1ZTpLZy/k54mTecY+6IRitn3HdmZNmc4P4yae0ueIoa+yaOY8Vs5fSr269Rjx1sjsx37ftJG7br2DFfOWcPXlVzFsxGsAjHr1DepdVJdlcxYx9rMxueZarmw5Bj89iC4dO7FsziLeGPbaaeNNmfozv0ybyvI5i5g7dSZbc5nWvUWnNlS7uBZbt/3Js088fcrjJ8v3errVz6/GJfUu5pdpM+jbqydffD2OEcNeZP+Bg9x1/8Ns/XM7VpuVQ4ePsPmPrbRq0QyAG6696oQ56nP6cux4xo6fiNOZSWpqCnVq/3sCwMV1LiShcUMAWjZP5KdfpuW7EfnJK96ceQu49srLiYvzTu3c/8bredlX5CtXqsiOTWs4r0IF9u3fz5XX30Lp0qW5Y0Duf0FV4BhjqFe/Id9Pm3PKY7t37iDypKm4sz78xhg6J/Xg7Y9GBynTc1P1audTr249pk7/lT6X9ubLsV8x/KVhHDhwgP88dB9bt/2JzWbj0JHDbNn6By2beU8Wuf7q6/KsEV99M5Zx347HmekkJTWVOrUvzH7sogvr0KRRYwBaNGvOz78WbVjidPHmLpjH1ZdflV0jbr7+RoaNGH7CusvmLCIzM5OBzzzBR599wmMPPXraWAUa073lxhv48utvWLfhd5KPJtOuTf6Q8vsAABSlSURBVCseGPgkHdq1ZfWSuaxcOJtqVSqTkfHvD0Di4+Jy7WvBoiV8+PFopkwax5olc3E89wzpOdaLij7pA+Qu2vQe+cXLS1RUFOdVqADAeRUq0O+6q1m0ZFmRclGn17RZC9atXc0O397E+K+8B04bNklg+59bWTh3dnbbNSuX5/vNo2PXJGZP/5XNv284YT3lf/1vuIkx33zN+t83cPRYMu1at+XBJx6lQ9v2rJy/lGVzFlG1chXS0wtQIxYv5MPPPuHH8ZNYOX8pQ555nvT09OzHo6P+PdnEarHichXqGuKFjlcQERER3HzDjXw9If9JZQpUdK+8rDfzFy5h5Nvvc8tNNyAiHDl6lJo1zkdEmDFrDlu3FWx++iNHj1KyVEnKlS1LRkYGo8d8XaD1SpQoQfLR/C91ULJEPEeT/213ungd27dl4g+TSU1NxePx8NW4CdmP7du/3zv9MpCamspPv0ylcUP/zHuvclehYkVeHfUeN1/dl6Q2zTh4YD8RERFUrlKVz8d/x4hhL9K1VVPaN23A60NfyLfoXnBhHd755HMG3n9X9npffvpRkLbm3HJFn8tYsHgho957i/433ISIcPToUWqcX91bI+bM4s/t2/LvCDh69CilSpakXNlyZGRk8PnX+R9AByhZosQJn/2CtjtdvA5t2/Pd5O+za0TOorr7f39x/PhxADweD99P/oH69ernG79A0/XExsbSt3dPPh8zlj/WrQBg6JDneHDgUziGDqdZ0wQaNrikIF3Ro1sXvv7mWy5JaEW5cuVo37YVy1euzne9Rg0u4aI6F9KkZQcuvujCPMd1u3TswBtvvUfTNp3o0LYNrw115Bmvb6+eLF66nKatO1GmTBlaNk/k8JGjACxcvBTHy8OxWC24Ml306pnE/f+5o0DbqM5c56Qe9L3qGgDGfTmaJs2aY7FYaJLYnElTZ53S/vwaNfl9194873fq2p1OXbsHPvFzXGxsLH169uaLsWPYtGo9AC8OdvDwEwN5cfhQEps0pWH9gu20dO+axNgJ39CgZQLly5alXeu2LF+V/48+G9ZvwEUXXkjTdi24uM5FeY7rdu7QiVHvvkXzjq1p36Ydr74wNM94fS7tzZLlS2nWsTVlS5ehRbPmHPYdwN2y9Q+eHvwsxhg8Hg+NGjTijWHDc42Z0wnXXnjllWFjurRvfVPTRvlX67PFsWPHKVEiHo/Hw90PPEqVSpV4wXf0Oz9rN2xixpyFE596+ulrApzmWWfYK6/svrzfrdXKlq9wwvJRw4fy03cTcblclC5ThuFvvc9FdesFLI+Vi+ezctG8oYMHDx4UsCBnEYfDcV5kROSOgf958Jz5SdqxY8coUaIEHo+Hex65n8qVKuN4dnCB1/98/FdH/9m352q73T4TTtrTNca4PJ5z6wj9bf+5nx27dpOelk5Ck0Y8/sgDBV7X4/FgMHqhlDMgSKoz49Rxs0eefJZHnnw2aHlkpKW5jDHHghaw+HN7jOec+hXmHfffzc7du0hLSyOhcQKPPfhIodb3GA9A9sDzCUU3M9O1/p89e9OgQdj/FWvZMSn7KHX2suaJvDfq9UL18+3Xn59xDv/s2Zee6cxcf8YdnMNcbtcvf27+/YJKVc8P9IzUeefgcvHnlo3pwOx8G6ssRzDGdfRYclSpEuF9ue02XTucciC+RWJz3hnxZqH6Gf/Fqae9FpTb7ebQkcNRQPa5Zifv6Y7/fcufQ+peVJsLa9UgnK/DsHTu9JDFNsawbcdu1m/c4vEYk//hSnUKt8s1YtNva27MSEsrUaP2RdG2iOBehyE15RgbVq9IyXRmzAb0tJQCstvt7mFDh301dda0m/t27xUbGxMb6pTytGjmvJDGdzqdzF2ywGkRWWu323dnLT/lIuYOh6NzZGTEZxgqRkVFhs1XZ5fLFWmz2ZxFbeMPGRlOG8JBpzPzdrvdHrrqX8w5HI6KInJLZFR0dxHO6NvVoUOHqpctW3ZXYdYxgPF49jozMiYA39rt9qKdl3iOcTgckZGRkR+53e7roqOiXBaLxe9jkgX9LJ/cLut+sGpBXozHkJaRHmWz2uZnODOut9vtB7Mey3MKdofDUQUImwvrfvnll/f379//3aK28ZNk4O+sixKr0AnnqbbPdg6HIwaoRgHPgiqMgn6WT26XdT+ItSAvHmCP3W4/evIDeRbdcFOQD5d+AM89+pqfnQr6up7cLut+OL8vzqmjkEopFWpadJVSKoi06CqlVBBp0VVKqSDSoquUCnsiEiUitUKdhz+EVdEVkRKhzkEpFZa6W0Q2R1htxf6C1mFVdGNtrBWRmqHOQykVdqznl6nkLhUT/9/YyOjXRCSsaldhhFXiDyVQI9rKGhFpF+pclFLhpVLJ8s5xdw6PrVmu6n1xkTFTRCQ+1DmdibAquv3rYRnViVKxNqZFWOS2UOejlAovZWJL8kn/IbEd6iR2io2MXk0Y/Wq2oMKq6AK0rwrjehFTNpp34iJklIhYQ52TUip8RNoieKHvfdF3tr2qllUs94hI61DnVBhhV3QBapeG7/oSW6c0d8VFME0PsCmlchIRbmnV19qtbqvomIioGcXpAFtYFl2A0tEwugexSdVpG2tjLRDeF+9USgVd9XKV+WzAi7GlYkr8NzYy+rVQ51MQYVt0ASKtcN1FRLkN1QjzXJVSoVGrXFXaX9g0whhzA7A73xVCLGRX7S+IydswQxaT6nRzvTFmSqjzUUqFl0xXJg9980rq+r+3rk93OXsZYw7mv1ZohWXR9RgYuYrMsZs5lO6mqzFmQ6hzUkqFl7+P7Gfi6pmZbuP+NtWZfqcxxWO+wrAruimZ8Ng80lbtY2Oai57GmP2hzkkpFV7W7N7EIxNeS0vJSJtnMLea4nJhcMKs6P59HO6aQer+VH5IcXGbMSYj1DkppcLLj2vnmOHTPktxujOvMZjWxangQpgV3at/Ii3DzQsZboYXtydSKRVYbuPhjRlfOL9bM+tAusvZ1RizqbidowthVnSPObnWowfMlFK5WLlzQ8mN/2xblpaZUSwOmOUlrE7D0oKrlMrDBpvVNjTFmdauOBdcCLM9XaWUyo0x5g9gUKjz8Iew2tNVSqmznRZdpZQKIi26SikVRFp0lVIqiLToKqVUEGnRVUqpINKiq5RSwWSMCZsb4AGMv28xkewDmuYV1xYTPz8QcfWmt3PxZomIOgy0z/NzHhP5Yz59eLDIMsBmjCEqUjZbrfTLo2YMwRa1CGvEI4DkXB7qepZnvSGMJDXh+OiHOOOpeUZ8D49dceryKSuo8PDHzLdY5BaPx0w8+XGL1Vap2xtTKVOnyZmGViGy4evXqH/jE6FOQ+WwZ9Xs0ouH3fmrxRbxgMeV+ekpDazWSgy8GurXyL2DlHTB/kUT0pwzReQyq4XapUtHfBIXa22UmuYZZIzxnNBeLHWIiH4Vl7OJiNwV7pd4DKvhBQFEzvxGHsv7NIfvniG2TBxfxEaJQ0Tk1OCC6K3Y3SD0OejtxFvlxC50Gzk1JqpU+bcjYuLelNwmlz3dhz0+BtrWj6Bl3ZZERawFmPNL85jaF8Q+GB9nnSwicaf0d80LkVS+6FoiY+aJSFk/lya/CquiG0iNasLMF4mtUYHH46OZJCIxoc5JqbNVyfMvosc7s2NL1ah3py0mfpqIFG6OQ4sFBiRFcU378xGs55WPZPoPiXHdu5brEh9nXSki1U5oHxkDVw2JpX5SUyKi1wHl/bg5fnXOFF2AiqVhymBiO9SnR1w0y0WkcqhzUupsFVWyLJ1f/SH2/HZ929qi49aISM1Cd9I1wYJ4y1R0tJVP3q0fPfDBGrVjYixrRaTFCW0tVuh8ZySd76qMWO8Si6W7HzbD786pogsQEwkf3U/MPT24OCaSdSLSNNQ5KXW2stgiaPbQyKgG/Z+uYY2KWSMi7QrdSY7BQBFh4AM1bR+/U79sbKxlNtDglPYNkoR6HSOJiPlebJEPi3ccKmycc0UXvMNGj12B7c07KRcTyXyDiQ91TkqdrUSEiy6/29Lm2U9L2aLjpmFMkb/69+5RgV+/T4yNjbFchphTx3hLngf9R8UQX24oETGnHswLobAqum4PscGM16c5vHQTsXg8lYIZV6lzUeXELrQY+E4MHk8tf/TX8JIS3HBNpQgMuR+fKVUJrhsWi3Hf6o94/hJWRbeozi/k38+fV8BzX+EBcQYmIxVocRXPD3UKqoD2rJrDsjce8CCSnm/j8vkfd1v/+3HGT9rrPa/3ZKUqwtG9MP5pD4a9Z5JvoIRV0bVaSC3K+tcXcLTIGBj5I64HP+JgmpPmYrHsKkpcFTq1ut0Q6hRUPowxbPnhQ8/Cl2896kpP6YTFsi7fldqdOlSb08/T9tPjipWpaenum0AOndKgdGUY80gaxw89htsZVgfMw+rHEcGQ5oSHPiJt7nq2pznpZoz5J6pEmVCnpdRZyePKZOW7T2bsXvDD3+6MtK7GmO1SopCjiCbHf41h5Ls73a+/uSM5Nc3T0xizTCJj3zqh/foZhln/TcHtvMZ4PL/6YTP86pwqunuPwI0jSN19gOnH0+lnjEkLdU5Kna0ykg+x4IX+qUd3bV7qSku5whiTXOhOZq3xYDwWgIwMD/cN3Jj+64wDO1PTPN2MMX+d0NbjhnmfOVk37QCujK7GmE3+2RL/OmeK7rqd0O910tKcjEhzYtcp3pUKnOTdfzD3uWtTM1OOfuxKTx1ojHEXqgO3B8bMzGDJxj0Yqu0/6LTeePtvKdt3pM09nuK+zhiTckJ7ZxpMGpLKni3ryUwP69mCz4mi+/MKePAjUp0uBrjc5ttQ56PU2WzPqjksGnp7msflfMid6fyk0B2kpMPb36eya/8KMjIvw8LBjj2Xp6Wlud/O9doLAN8OduJyTiAzLeyvvRDyK+6cdMWggFz1KDqS/Zz+KmMLAhVbb3o7126WiKgjnO4qY9GRk/Ppw0NUxHv4rjIWGSlbrFZuzLM/W9Tik68yFs43Cadv2YH85cjphhPC7RcrShV3Rf28nc3Df2FVdJVS6mwXVufpKqXU2U6LrlJKBZEWXaWUCiItukopFURadJVSKoi06CqlVBBp0VVKqSDSoquUUkGkRVcppYJIi65SSgWRFl2llAoiLbpKKRVEWnSVUiqItOgqpVQQhVXRFZG7z8XYoY6vsTX2uRQ/1MKq6AKhfDFC/UY4V7ddY59bscMhfkiFW9FVSqmzmhZdpZQKonAruh+eo7FDHV9ja+xzKX5I6RxpSikVROG2p6uUUme1sCi6IvKpiOwTkfXBjCEiQ0TkfyKyxnfr5VteTkRmi8hxEXnHD7GjRWSZiKwVkQ0i4vAtryUiS0Vkq4h8IyKRvuUdRGSViLhE5Jqixvf1aRWR1SLyk+/+aBHZnmPbm/iW1xWRxSKSISKP+yFuaRH5VkQ2ichGEWktImVFZLqI/OH7t4y/Y4vIxTm2bY2IJIvII0F8zR8WkfW+1/sR37KAbXce7++84nUSkaM5noPBp+vnDGO/5nvNfxOR70SkdI7HnvG95zeLSI+ixC6OwqLoAqOBniGKMdIY08R3+9m3LB14Hihy0fHJALoYYxoDTYCeItIKeNUX/0LgMHCHr/0u4Fbgaz/FB3gY2HjSsidybPsa37JDwEPA636K+yYw1RhTF2jsy+FpYKYxpg4w03ffr7GNMZuztg1IBFKB73wPB/Q1F5EGwF1AC7zb3EdELiSw2z2aU9/fecUDmJ/jOXghn37OJPZ0oIExphGwBXgGQEQuAW4A6vvWeU9ErEWIXeyERdE1xszD+8YLixjGmBRjzAK8H0R/xDbGmOO+uxG+mwG6AN/6ln8OXOFrv8MY8xvg8Ud8EakG9AY+LkCu+4wxy4FMP8QtBXQAPvH17TTGHAEux7u9cOJ2+y32SboCfxpjdubVwM+veT1gqTEm1RjjAuYCVxHA7c7j/Z1rvDPop9DrGGOm+bYdYAlQLUdO44wxGcaY7cBWvH+cglIHwkFYFN0Qe8D3FejTrK9fgeD7er8G2Id3L+BP4EiON+ZfQNUAhR8FPMmpRfxl37aPFJGoAMStBewHPvMNbXwsInFARWPMP742e4CKAYid0w3A2Bz3A/2arwfa+4YsYoFewPkEf7tPF6+1b7jrFxGpH+A8bgd+8f2/KrA7x2OBfN+HpXO96L4P1Mb7lf8fYESgAhlj3L6vutXw/mWvG6hYOYlIH2CfMWblSQ8948uhOVAWeCoA4W1AU+B9Y0wCkMKJX3Ex3tNnAnYKjW+c/DJggm9RwF9zY8xGvENH04CpwBrAfVKbgG53LjnljLcKqOEb7nob+D5QcUVkEOACvgpUjOLmnC66xpi9vmLoAT7C9zUnwDGPALOB1kBpEbH5HqoG/C8AIdsCl4nIDmAc0EVExhhj/vENe2QAnxGYbf8L+MsYs9R3/1u8RXiviFQG8P27LwCxs1wKrDLG7IXgvebGmE+MMYnGmA54x+u3ENztJq94xpjkrOEu35h2hIiU93dwEbkV6APcZP49N/V/ePf6swTqfR+2zumim/WG9LkS79fCQMSpkHX0VkRigCS8B5RmA1lnJwwAfvB3bGPMM8aYasaYmni/Zs8yxtyc48MoeMf6/L7txpg9wG4Rudi3qCvwO/Aj3u2FAG13Dv3IMbQQxNf8PN+/1fGO535NcLebvOKJSCXf646ItMBbBw76M7CI9MQ7pHWZMSb1pJxuEJEoEakF1AGW+TN22DPGhPyG90PxD94DCX8BdwQjBvAlsA74De+boXKO9jvwDuof97W/pAixGwGrfXHWA4N9yy/A+4bbivfrb5RveXNfzBS8H4YNfnoOOgE/+f4/y7ft64ExQLxveSVf7GTgiO//JYsQswmwwrft3wNlgHJ4j6b/AcwAygYodpzv+SuVY1mwXvP5eP/ArAW6+pYFbLvzeH/nFe8BYIMvtyVAm6J8FvOIvRXv2O0a3+2DHO0H4T2msRm4tCixi+NNf5GmlFJBdE4PLyilVLBp0VVKqSDSoquUUkGkRVcppYJIi65SSgWRFl2llAoiLbpKKRVEWnSVUiqI/g/VWm26DYhHRQAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "targetfile = '../notebooks/input_files/recA_amplicon.gb'\n", "feature_parse_specs_file = '../notebooks/input_files/recA_feature_parse_specs.yaml'\n", "\n", "with open(feature_parse_specs_file) as f:\n", " feature_parse_specs = yaml.safe_load(f)\n", "# we can't get accuracies for the dummy alignments used here\n", "feature_parse_specs['RecA_PacBio_amplicon']['gene']['return'] = 'mutations'\n", "feature_parse_specs['RecA_PacBio_amplicon']['barcode']['return'] = 'sequence'\n", "\n", "targets = alignparse.targets.Targets(seqsfile=targetfile,\n", " feature_parse_specs=feature_parse_specs,\n", " allow_extra_features=True)\n", "\n", "print(f\"There is just one target: {targets.target_names}\")\n", "target = targets.targets[0]\n", "\n", "print(f\"It has the following features: {target.feature_names}\")\n", "\n", "_ = targets.plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Location of features in 0-indexed scheme:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "termini5 0 147\n", "gene 147 1206\n", "spacer 1206 1285\n", "barcode 1285 1303\n", "termini3 1303 1342\n", "variant_tag5 32 33\n", "variant_tag3 1310 1311\n" ] } ], "source": [ "for feature in target.features:\n", " print(feature.name, feature.start, feature.end)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Make some queries\n", "We make some query sequences that we will align against the target.\n", "The name of each query is a description of all ways that it differs from target:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "random.seed(1) # for reproducible output\n", "nts = 'ACGT' # valid nucleotides\n", "\n", "def randseq(length):\n", " \"\"\"Random nucleotide sequence of given length.\"\"\"\n", " return ''.join(random.choices(nts, k=length))\n", "\n", "def mutseq(wtseq):\n", " \"\"\"Random mutant sequence from a wildtype sequence.\"\"\"\n", " return ''.join(random.choice([nt for nt in nts if nt != wt]) for wt in wtseq)\n", "\n", "def get_wt_query(target, ambiguous_features=('barcode', 'variant_tag5', 'variant_tag3')):\n", " \"\"\"`(description, seq)` for wildtype query, ambiguous features assigned.\"\"\"\n", " seq = target.seq\n", " description = []\n", " for fname in ambiguous_features:\n", " f = target.get_feature(fname)\n", " fseq = randseq(f.length)\n", " seq = seq[: f.start] + fseq + seq[f.end:]\n", " description.append(f\"{f.name}={fseq}\")\n", " assert len(seq) == len(target.seq)\n", " assert re.fullmatch(f\"[{nts}]+\", seq)\n", " return ','.join(description), seq\n", "\n", "queries = []\n", "\n", "# get two random (unmappable) queries\n", "queries.append(('unmapped_1', randseq(target.length)))\n", "queries.append(('unmapped_2', randseq(target.length // 2)))\n", "\n", "# get a fully wildtype query\n", "queries.append(get_wt_query(target))\n", "\n", "# query with query clipping at both ends and a codon substitution in gene\n", "desc, seq = get_wt_query(target)\n", "# add substitution to gene\n", "gene = target.get_feature('gene')\n", "geneseq = gene.seq\n", "mutstart = 6\n", "mutlength = 3\n", "mutcodon = mutseq(gene.seq[mutstart: mutstart + mutlength])\n", "sub_desc = []\n", "for i in range(mutlength):\n", " wt = geneseq[i + mutstart]\n", " mut = mutcodon[i]\n", " sub_desc.append(f\"{wt}{mutstart + i}{mut}\")\n", " geneseq = geneseq[: mutstart + i] + mut + geneseq[mutstart + i + 1:]\n", "seq = seq[: gene.start] + geneseq + seq[gene.end:]\n", "desc += f\",gene_{'-'.join(sub_desc)}\"\n", "# add clipping\n", "desc += ',query_clip5=9'\n", "seq = randseq(9) + seq\n", "desc += ',query_clip3=7'\n", "seq += randseq(7)\n", "# add to list of queries\n", "queries.append((desc, seq))\n", "\n", "# query with a deletion in gene and in insertion in spacer\n", "desc, seq = get_wt_query(target)\n", "delstart = 7\n", "dellength = 15\n", "geneseq = gene.seq[: delstart] + gene.seq[delstart + dellength:]\n", "spacer = target.get_feature('spacer')\n", "insstart = 4\n", "ins = 'G'\n", "spacerseq = spacer.seq[: insstart] + ins + spacer.seq[insstart:]\n", "seq = seq[: gene.start] + geneseq + seq[gene.end: spacer.start] + spacerseq + seq[spacer.end:] \n", "desc += f\",gene_del{delstart}to{delstart + dellength}\"\n", "desc += f\",spacer_ins{insstart}{ins}\"\n", "queries.append((desc, seq))\n", "\n", "# query with deletion spanning gene and spacer\n", "desc, seq = get_wt_query(target)\n", "delstartgene = gene.length - 7\n", "delendgene = delstartgene + 7\n", "geneseq = gene.seq[: delstartgene]\n", "delendspacer = 9\n", "spacerseq = spacer.seq[delendspacer:]\n", "seq = seq[: gene.start] + geneseq + spacerseq + seq[spacer.end:]\n", "desc += f\",gene_del{delstartgene}to{delendgene},spacer_del1to{delendspacer}\"\n", "queries.append((desc, seq))\n", "\n", "# query with insertion at boundary of gene and spacer, note how\n", "# it is assigned as being at end of gene\n", "desc, seq = get_wt_query(target)\n", "ins = randseq(14)\n", "seq = seq[: gene.end] + ins + seq[spacer.start:]\n", "desc += f\",gene_ins{gene.length}{ins}\"\n", "queries.append((desc, seq))\n", "\n", "# query with target clipping at both ends\n", "desc, seq = get_wt_query(target)\n", "target_clip5 = target.get_feature('termini5').length + 4\n", "target_clip3 = 2\n", "seq = seq[target_clip5: -target_clip3]\n", "desc = desc.split(',')\n", "desc = ','.join([desc[0], 'variant_tag5=None', desc[2]])\n", "desc += f\",target_clip5={target_clip5},target_clip3={target_clip3}\"\n", "desc += ',termini5=None,gene=,termini3='\n", "queries.append((desc, seq))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Align the queries to the target" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_CODON_DMS)\n", "\n", "with contextlib.ExitStack() as stack:\n", " \n", " # write queries to FASTA file\n", " queryfile = tempfile.NamedTemporaryFile('r+', suffix='.fasta')\n", " queryfile.write('\\n'.join(f\">{desc}\\n{seq}\" for desc, seq in queries))\n", " queryfile.flush()\n", " \n", " # align the queries to the target\n", " alignmentfile = tempfile.NamedTemporaryFile('r+', suffix='.sam')\n", " targets.align(queryfile.name, alignmentfile.name, mapper)\n", " \n", " # get the alignment cs data frame\n", " alignments_cs = targets._parse_alignment_cs(alignmentfile.name)\n", " \n", " # get the alignment results\n", " readstats, aligned, filtered = targets.parse_alignment(alignmentfile.name)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check results of `Targets._parse_alignment_cs`\n", "We now check if the feature-specific `cs` strings returned by `Targets._parse_alignment_cs` are correct.\n", "\n", "We expect there to be alignments for just our one target plus the number of unmapped reads:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(alignments_cs.keys()) == [target.name, 'unmapped']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure the number of unmapped queries equals the number we passed in:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unmapped_queries = [(desc, query) for desc, query in queries if 'unmapped' in desc]\n", "len(unmapped_queries) == alignments_cs['unmapped']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure we get the expected queries mapped to our target:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mapped_queries = [tup for tup in queries if tup not in unmapped_queries]\n", "len(mapped_queries) == len(alignments_cs[target.name])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we look manually at whether we got the expected `cs` strings for features:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_namequery_clip5query_clip3termini5_cstermini5_clip5termini5_clip3gene_csgene_clip5gene_clip3spacer_csspacer_clip5spacer_clip3barcode_csbarcode_clip5barcode_clip3termini3_cstermini3_clip5termini3_clip3variant_tag5_csvariant_tag5_clip5variant_tag5_clip3variant_tag3_csvariant_tag3_clip5variant_tag3_clip3
0barcode=CTAACTCGAGCTGCAATG,variant_tag5=A,variant_tag3=T00:32*na:11400:105900:7900*nc*nt*na*na*nc*nt*nc*ng*na*ng*nc*nt*ng*nc*na*na*nt*ng00:7*nt:3100*na00*nt00
1barcode=ACTAAAACCCACCGCGCT,variant_tag5=A,variant_tag3=C,gene_A6G-T7A-C8G,query_clip5=9,query_clip3=797:32*na:11400:6*ag*ta*cg:105000:7900*na*nc*nt*na*na*na*na*nc*nc*nc*na*nc*nc*ng*nc*ng*nc*nt00:7*nc:3100*na00*nc00
2barcode=GTTACGACACGTTATTTC,variant_tag5=T,variant_tag3=C,gene_del7to22,spacer_ins4G00:32*nt:11400:7-tcgacgaaaacaaac:103700:4+g:7500*ng*nt*nt*na*nc*ng*na*nc*na*nc*ng*nt*nt*na*nt*nt*nt*nc00:7*nc:3100*nt00*nc00
3barcode=TAGTCTGTATCATAACTT,variant_tag5=A,variant_tag3=T,gene_del1052to1059,spacer_del1to900:32*na:11400:1052-agatttt00-taatcgtct:7000*nt*na*ng*nt*nc*nt*ng*nt*na*nt*nc*na*nt*na*na*nc*nt*nt00:7*nt:3100*na00*nt00
4barcode=TACCCTTGCTAAGTACGC,variant_tag5=T,variant_tag3=G,gene_ins1059GGGATAGGAGACTA00:32*nt:11400:1059+gggataggagacta00:7900*nt*na*nc*nc*nc*nt*nt*ng*nc*nt*na*na*ng*nt*na*nc*ng*nc00:7*ng:3100*nt00*ng00
5barcode=GCGTCACAAGTGCCGAAT,variant_tag5=None,variant_tag3=T,target_clip5=151,target_clip3=2,termini5=None,gene=<clip4>,termini3=<clip2>001470:105540:7900*ng*nc*ng*nt*nc*na*nc*na*na*ng*nt*ng*nc*nc*ng*na*na*nt00:7*nt:290210*nt00
\n", "
" ], "text/plain": [ " query_name \\\n", "0 barcode=CTAACTCGAGCTGCAATG,variant_tag5=A,variant_tag3=T \n", "1 barcode=ACTAAAACCCACCGCGCT,variant_tag5=A,variant_tag3=C,gene_A6G-T7A-C8G,query_clip5=9,query_clip3=7 \n", "2 barcode=GTTACGACACGTTATTTC,variant_tag5=T,variant_tag3=C,gene_del7to22,spacer_ins4G \n", "3 barcode=TAGTCTGTATCATAACTT,variant_tag5=A,variant_tag3=T,gene_del1052to1059,spacer_del1to9 \n", "4 barcode=TACCCTTGCTAAGTACGC,variant_tag5=T,variant_tag3=G,gene_ins1059GGGATAGGAGACTA \n", "5 barcode=GCGTCACAAGTGCCGAAT,variant_tag5=None,variant_tag3=T,target_clip5=151,target_clip3=2,termini5=None,gene=,termini3= \n", "\n", " query_clip5 query_clip3 termini5_cs termini5_clip5 termini5_clip3 \\\n", "0 0 0 :32*na:114 0 0 \n", "1 9 7 :32*na:114 0 0 \n", "2 0 0 :32*nt:114 0 0 \n", "3 0 0 :32*na:114 0 0 \n", "4 0 0 :32*nt:114 0 0 \n", "5 0 0 147 0 \n", "\n", " gene_cs gene_clip5 gene_clip3 spacer_cs \\\n", "0 :1059 0 0 :79 \n", "1 :6*ag*ta*cg:1050 0 0 :79 \n", "2 :7-tcgacgaaaacaaac:1037 0 0 :4+g:75 \n", "3 :1052-agatttt 0 0 -taatcgtct:70 \n", "4 :1059+gggataggagacta 0 0 :79 \n", "5 :1055 4 0 :79 \n", "\n", " spacer_clip5 spacer_clip3 \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "5 0 0 \n", "\n", " barcode_cs barcode_clip5 \\\n", "0 *nc*nt*na*na*nc*nt*nc*ng*na*ng*nc*nt*ng*nc*na*na*nt*ng 0 \n", "1 *na*nc*nt*na*na*na*na*nc*nc*nc*na*nc*nc*ng*nc*ng*nc*nt 0 \n", "2 *ng*nt*nt*na*nc*ng*na*nc*na*nc*ng*nt*nt*na*nt*nt*nt*nc 0 \n", "3 *nt*na*ng*nt*nc*nt*ng*nt*na*nt*nc*na*nt*na*na*nc*nt*nt 0 \n", "4 *nt*na*nc*nc*nc*nt*nt*ng*nc*nt*na*na*ng*nt*na*nc*ng*nc 0 \n", "5 *ng*nc*ng*nt*nc*na*nc*na*na*ng*nt*ng*nc*nc*ng*na*na*nt 0 \n", "\n", " barcode_clip3 termini3_cs termini3_clip5 termini3_clip3 variant_tag5_cs \\\n", "0 0 :7*nt:31 0 0 *na \n", "1 0 :7*nc:31 0 0 *na \n", "2 0 :7*nc:31 0 0 *nt \n", "3 0 :7*nt:31 0 0 *na \n", "4 0 :7*ng:31 0 0 *nt \n", "5 0 :7*nt:29 0 2 \n", "\n", " variant_tag5_clip5 variant_tag5_clip3 variant_tag3_cs variant_tag3_clip5 \\\n", "0 0 0 *nt 0 \n", "1 0 0 *nc 0 \n", "2 0 0 *nc 0 \n", "3 0 0 *nt 0 \n", "4 0 0 *ng 0 \n", "5 1 0 *nt 0 \n", "\n", " variant_tag3_clip3 \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "5 0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with pd.option_context('display.max_colwidth', -1, 'display.max_columns', None):\n", " display(alignments_cs[target.name])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check reseults of `Targets.parse_alignment`\n", "Now check the results or `Targets.parse_alignment`.\n", "\n", "First make sure the read stats are correct.\n", "Here are those read stats:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
categorycount
0aligned RecA_PacBio_amplicon3
1filtered RecA_PacBio_amplicon3
2unmapped2
\n", "
" ], "text/plain": [ " category count\n", "0 aligned RecA_PacBio_amplicon 3\n", "1 filtered RecA_PacBio_amplicon 3\n", "2 unmapped 2" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "readstats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure they are correct for number of unmapped reads:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "readstats.set_index('category')['count'].to_dict()['unmapped'] == len(unmapped_queries)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now look at the filtered reads and make sure that they are what we expect:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_namefilter_reason
0barcode=ACTAAAACCCACCGCGCT,variant_tag5=A,variant_tag3=C,gene_A6G-T7A-C8G,query_clip5=9,query_clip3=7query_clip5
1barcode=TAGTCTGTATCATAACTT,variant_tag5=A,variant_tag3=T,gene_del1052to1059,spacer_del1to9spacer mutation_nt_count
2barcode=GCGTCACAAGTGCCGAAT,variant_tag5=None,variant_tag3=T,target_clip5=151,target_clip3=2,termini5=None,gene=<clip4>,termini3=<clip2>termini5 clip5
\n", "
" ], "text/plain": [ " query_name \\\n", "0 barcode=ACTAAAACCCACCGCGCT,variant_tag5=A,variant_tag3=C,gene_A6G-T7A-C8G,query_clip5=9,query_clip3=7 \n", "1 barcode=TAGTCTGTATCATAACTT,variant_tag5=A,variant_tag3=T,gene_del1052to1059,spacer_del1to9 \n", "2 barcode=GCGTCACAAGTGCCGAAT,variant_tag5=None,variant_tag3=T,target_clip5=151,target_clip3=2,termini5=None,gene=,termini3= \n", "\n", " filter_reason \n", "0 query_clip5 \n", "1 spacer mutation_nt_count \n", "2 termini5 clip5 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with pd.option_context('display.max_colwidth', -1, 'display.max_columns', None):\n", " display(filtered[target.name])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check that the aligned reads are also what we expect:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_namequery_clip5query_clip3gene_mutationsbarcode_sequencevariant_tag5_sequencevariant_tag3_sequence
0barcode=CTAACTCGAGCTGCAATG,variant_tag5=A,variant_tag3=T00CTAACTCGAGCTGCAATGAT
1barcode=GTTACGACACGTTATTTC,variant_tag5=T,variant_tag3=C,gene_del7to22,spacer_ins4G00del8to22GTTACGACACGTTATTTCTC
2barcode=TACCCTTGCTAAGTACGC,variant_tag5=T,variant_tag3=G,gene_ins1059GGGATAGGAGACTA00ins1060GGGATAGGAGACTATACCCTTGCTAAGTACGCTG
\n", "
" ], "text/plain": [ " query_name \\\n", "0 barcode=CTAACTCGAGCTGCAATG,variant_tag5=A,variant_tag3=T \n", "1 barcode=GTTACGACACGTTATTTC,variant_tag5=T,variant_tag3=C,gene_del7to22,spacer_ins4G \n", "2 barcode=TACCCTTGCTAAGTACGC,variant_tag5=T,variant_tag3=G,gene_ins1059GGGATAGGAGACTA \n", "\n", " query_clip5 query_clip3 gene_mutations barcode_sequence \\\n", "0 0 0 CTAACTCGAGCTGCAATG \n", "1 0 0 del8to22 GTTACGACACGTTATTTC \n", "2 0 0 ins1060GGGATAGGAGACTA TACCCTTGCTAAGTACGC \n", "\n", " variant_tag5_sequence variant_tag3_sequence \n", "0 A T \n", "1 T C \n", "2 T G " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with pd.option_context('display.max_colwidth', -1, 'display.max_columns', None):\n", " display(aligned[target.name])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" }, "toc": { "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }