{ "cells": [ { "cell_type": "markdown", "id": "ba0e9f28-bcf7-460c-a94a-99aefa3790d0", "metadata": {}, "source": [ "# Introduction to pyalign\n", "\n", "> Bernhard Liebl
\n", "> Computational Humanities Group
\n", "> Leipzig University" ] }, { "cell_type": "markdown", "id": "3ccc55c2-dc2a-4a2f-b928-7af323135abe", "metadata": {}, "source": [ "In the following sections, this notebook will introduce you to the core concepts of `pyalign`. This will enable you to\n", "\n", "* correctly and efficiently formulate specific alignment problems\n", "* choose the right solver for solving them\n", "* take care of runtime performance considerations to obtain fast results" ] }, { "cell_type": "markdown", "id": "7b40f0bd-ecff-48e4-942c-661461a13487", "metadata": {}, "source": [ "### Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "6ce19464-0d96-41f8-b993-e64389df0770", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " Loading BokehJS ...\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", "(function(root) {\n", " function now() {\n", " return new Date();\n", " }\n", "\n", " var force = true;\n", "\n", " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", " root._bokeh_onload_callbacks = [];\n", " root._bokeh_is_loading = undefined;\n", " }\n", "\n", " var JS_MIME_TYPE = 'application/javascript';\n", " var HTML_MIME_TYPE = 'text/html';\n", " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", " var CLASS_NAME = 'output_bokeh rendered_html';\n", "\n", " /**\n", " * Render data to the DOM node\n", " */\n", " function render(props, node) {\n", " var script = document.createElement(\"script\");\n", " node.appendChild(script);\n", " }\n", "\n", " /**\n", " * Handle when an output is cleared or removed\n", " */\n", " function handleClearOutput(event, handle) {\n", " var cell = handle.cell;\n", "\n", " var id = cell.output_area._bokeh_element_id;\n", " var server_id = cell.output_area._bokeh_server_id;\n", " // Clean up Bokeh references\n", " if (id != null && id in Bokeh.index) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", "\n", " if (server_id !== undefined) {\n", " // Clean up Bokeh references\n", " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", " cell.notebook.kernel.execute(cmd, {\n", " iopub: {\n", " output: function(msg) {\n", " var id = msg.content.text.trim();\n", " if (id in Bokeh.index) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", " }\n", " }\n", " });\n", " // Destroy server and session\n", " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", " cell.notebook.kernel.execute(cmd);\n", " }\n", " }\n", "\n", " /**\n", " * Handle when a new output is added\n", " */\n", " function handleAddOutput(event, handle) {\n", " var output_area = handle.output_area;\n", " var output = handle.output;\n", "\n", " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", " return\n", " }\n", "\n", " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", "\n", " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", " // store reference to embed id on output_area\n", " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", " }\n", " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", " var bk_div = document.createElement(\"div\");\n", " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", " var script_attrs = bk_div.children[0].attributes;\n", " for (var i = 0; i < script_attrs.length; i++) {\n", " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", " }\n", " // store reference to server id on output_area\n", " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", " }\n", " }\n", "\n", " function register_renderer(events, OutputArea) {\n", "\n", " function append_mime(data, metadata, element) {\n", " // create a DOM node to render to\n", " var toinsert = this.create_output_subarea(\n", " metadata,\n", " CLASS_NAME,\n", " EXEC_MIME_TYPE\n", " );\n", " this.keyboard_manager.register_events(toinsert);\n", " // Render to node\n", " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", " render(props, toinsert[toinsert.length - 1]);\n", " element.append(toinsert);\n", " return toinsert\n", " }\n", "\n", " /* Handle when an output is cleared or removed */\n", " events.on('clear_output.CodeCell', handleClearOutput);\n", " events.on('delete.Cell', handleClearOutput);\n", "\n", " /* Handle when a new output is added */\n", " events.on('output_added.OutputArea', handleAddOutput);\n", "\n", " /**\n", " * Register the mime type and append_mime function with output_area\n", " */\n", " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", " /* Is output safe? */\n", " safe: true,\n", " /* Index of renderer in `output_area.display_order` */\n", " index: 0\n", " });\n", " }\n", "\n", " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", " if (root.Jupyter !== undefined) {\n", " var events = require('base/js/events');\n", " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", "\n", " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", " register_renderer(events, OutputArea);\n", " }\n", " }\n", "\n", " \n", " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", " root._bokeh_timeout = Date.now() + 5000;\n", " root._bokeh_failed_load = false;\n", " }\n", "\n", " var NB_LOAD_WARNING = {'data': {'text/html':\n", " \"
\\n\"+\n", " \"

\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"

\\n\"+\n", " \"\\n\"+\n", " \"\\n\"+\n", " \"from bokeh.resources import INLINE\\n\"+\n", " \"output_notebook(resources=INLINE)\\n\"+\n", " \"\\n\"+\n", " \"
\"}};\n", "\n", " function display_loaded() {\n", " var el = document.getElementById(\"1001\");\n", " if (el != null) {\n", " el.textContent = \"BokehJS is loading...\";\n", " }\n", " if (root.Bokeh !== undefined) {\n", " if (el != null) {\n", " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", " }\n", " } else if (Date.now() < root._bokeh_timeout) {\n", " setTimeout(display_loaded, 100)\n", " }\n", " }\n", "\n", "\n", " function run_callbacks() {\n", " try {\n", " root._bokeh_onload_callbacks.forEach(function(callback) {\n", " if (callback != null)\n", " callback();\n", " });\n", " } finally {\n", " delete root._bokeh_onload_callbacks\n", " }\n", " console.debug(\"Bokeh: all callbacks have finished\");\n", " }\n", "\n", " function load_libs(css_urls, js_urls, callback) {\n", " if (css_urls == null) css_urls = [];\n", " if (js_urls == null) js_urls = [];\n", "\n", " root._bokeh_onload_callbacks.push(callback);\n", " if (root._bokeh_is_loading > 0) {\n", " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", " return null;\n", " }\n", " if (js_urls == null || js_urls.length === 0) {\n", " run_callbacks();\n", " return null;\n", " }\n", " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", "\n", " function on_load() {\n", " root._bokeh_is_loading--;\n", " if (root._bokeh_is_loading === 0) {\n", " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", " run_callbacks()\n", " }\n", " }\n", "\n", " function on_error(url) {\n", " console.error(\"failed to load \" + url);\n", " }\n", "\n", " for (let i = 0; i < css_urls.length; i++) {\n", " const url = css_urls[i];\n", " const element = document.createElement(\"link\");\n", " element.onload = on_load;\n", " element.onerror = on_error.bind(null, url);\n", " element.rel = \"stylesheet\";\n", " element.type = \"text/css\";\n", " element.href = url;\n", " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", " document.body.appendChild(element);\n", " }\n", "\n", " const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.3.2.min.js\": \"XypntL49z55iwGVUW4qsEu83zKL3XEcz0MjuGOQ9SlaaQ68X/g+k1FcioZi7oQAc\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.3.2.min.js\": \"bEsM86IHGDTLCS0Zod8a8WM6Y4+lafAL/eSiyQcuPzinmWNgNO2/olUF0Z2Dkn5i\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.3.2.min.js\": \"TX0gSQTdXTTeScqxj6PVQxTiRW8DOoGVwinyi1D3kxv7wuxQ02XkOxv0xwiypcAH\"};\n", "\n", " for (let i = 0; i < js_urls.length; i++) {\n", " const url = js_urls[i];\n", " const element = document.createElement('script');\n", " element.onload = on_load;\n", " element.onerror = on_error.bind(null, url);\n", " element.async = false;\n", " element.src = url;\n", " if (url in hashes) {\n", " element.crossOrigin = \"anonymous\";\n", " element.integrity = \"sha384-\" + hashes[url];\n", " }\n", " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", " document.head.appendChild(element);\n", " }\n", " };\n", "\n", " function inject_raw_css(css) {\n", " const element = document.createElement(\"style\");\n", " element.appendChild(document.createTextNode(css));\n", " document.body.appendChild(element);\n", " }\n", "\n", " \n", " var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.3.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.3.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.3.2.min.js\"];\n", " var css_urls = [];\n", " \n", "\n", " var inline_js = [\n", " function(Bokeh) {\n", " Bokeh.set_log_level(\"info\");\n", " },\n", " function(Bokeh) {\n", " \n", " \n", " }\n", " ];\n", "\n", " function run_inline_js() {\n", " \n", " if (root.Bokeh !== undefined || force === true) {\n", " \n", " for (var i = 0; i < inline_js.length; i++) {\n", " inline_js[i].call(root, root.Bokeh);\n", " }\n", " if (force === true) {\n", " display_loaded();\n", " }} else if (Date.now() < root._bokeh_timeout) {\n", " setTimeout(run_inline_js, 100);\n", " } else if (!root._bokeh_failed_load) {\n", " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", " root._bokeh_failed_load = true;\n", " } else if (force !== true) {\n", " var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n", " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", " }\n", "\n", " }\n", "\n", " if (root._bokeh_is_loading === 0) {\n", " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", " run_inline_js();\n", " } else {\n", " load_libs(css_urls, js_urls, function() {\n", " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", " run_inline_js();\n", " });\n", " }\n", "}(window));" ], "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"1001\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n const hashes = {\"https://cdn.bokeh.org/bokeh/release/bokeh-2.3.2.min.js\": \"XypntL49z55iwGVUW4qsEu83zKL3XEcz0MjuGOQ9SlaaQ68X/g+k1FcioZi7oQAc\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.3.2.min.js\": \"bEsM86IHGDTLCS0Zod8a8WM6Y4+lafAL/eSiyQcuPzinmWNgNO2/olUF0Z2Dkn5i\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.3.2.min.js\": \"TX0gSQTdXTTeScqxj6PVQxTiRW8DOoGVwinyi1D3kxv7wuxQ02XkOxv0xwiypcAH\"};\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n if (url in hashes) {\n element.crossOrigin = \"anonymous\";\n element.integrity = \"sha384-\" + hashes[url];\n }\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-2.3.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-2.3.2.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-2.3.2.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import bokeh.io\n", "bokeh.io.output_notebook()" ] }, { "cell_type": "markdown", "id": "51a91080-a675-4dfe-a987-4faad280d44a", "metadata": {}, "source": [ "### Problems, Solvers and Solutions" ] }, { "cell_type": "markdown", "id": "4d6d1444-0f40-48d8-b115-63cccd0d6961", "metadata": {}, "source": [ "To compute an optimal alignment with *pyalign* we first need to state an alignment problem. The most common and efficient way to state an alignment problem in *pyalign* consists of three parts:\n", "\n", "* An alphabet $\\Omega$ of all elements that may occur in a sequence\n", "* Two sequences $s, t$ from $\\Omega$ [^1] that are going to be aligned\n", "* A function $w(x, y)$ which gives similarity (or distance) between $x$ and $y$\n", "\n", "[^1] $s=s_1,...,s_n$ and $t=t_1,...,t_m$, with $\\forall i, s_i \\in \\Omega$ and $\\forall j, t_j \\in \\Omega$ \n", "\n", "Let us first define $w(x, y)$ as the following binary measure of similarity:\n", "\n", "$$\n", "w(x, y) =\n", "\\begin{cases}\n", "1 && \\text{if} \\quad x = y\\\\\n", "-1 && else\n", "\\end{cases}\n", "$$\n", "\n", "Through *pyalign*'s `problems.alphabetic`, the following code cell defines the set of problems that can be posed over the alphabet $\\{A, G, C, T\\}$ and using $w(x, y)$ as a measure of similarity as just defined. Since the values from $w(x, y)$ are similarities and higher values mean better alignments, we configure `direction` to *maximize* in order to find optimal alignments. We write $w(x, y)$ as a lambda." ] }, { "cell_type": "code", "execution_count": 2, "id": "46401d3f-65ca-49c3-babd-8ef8e9763393", "metadata": {}, "outputs": [], "source": [ "import pyalign.problems\n", "\n", "problem_set = pyalign.problems.alphabetic(\n", " \"AGCT\",\n", " lambda x, y: 1 if x == y else -1,\n", " direction=\"maximize\")" ] }, { "cell_type": "markdown", "id": "94dbb0ab-16d4-4fc4-8782-8c14c3f82784", "metadata": {}, "source": [ "Using `problem_set`, we can now pose a problem for two specific sequences with all the conditions just defined above:" ] }, { "cell_type": "code", "execution_count": 3, "id": "12936204-5640-4ed8-afa7-c08f47942851", "metadata": {}, "outputs": [], "source": [ "problem = problem_set.new_problem(\"AATCG\", \"AACG\")" ] }, { "cell_type": "markdown", "id": "30a8dc1a-35ba-40a4-ac3b-03796b605b07", "metadata": {}, "source": [ "We now create a solver to solve the problem just posed. We choose a `LocalSolver`, which means we want to obtain an optimal *local* alignment. In later sections, we will see that `pyalign` also offers analog similar classes for *global* and *semiglobal* alignment problems as well as *elastic* - dynamic time warping - problems.\n", "\n", "Later, we will talk about how to specify a specific sort of gap cost here - i.e. the cost incurred when skipping parts of the aligned sequences - but for now we will not specify any gap cost - i.e. we do not incur any such cost.\n", "\n", "\n", "
For performance reasons, try to avoid recreating Solver objects, and instead try to reuse them for as many Problem instances as possible.
" ] }, { "cell_type": "code", "execution_count": 4, "id": "7a64decf-0158-469b-a613-b7cd6f8fb326", "metadata": {}, "outputs": [], "source": [ "import pyalign.solve\n", "\n", "solver = pyalign.solve.LocalSolver()" ] }, { "cell_type": "markdown", "id": "2ca190e4-de41-4a05-8759-7de7605b6d35", "metadata": {}, "source": [ "To solve the problem, we run `solve`. We obtain a `Solution` object that contains the optimal alignment." ] }, { "cell_type": "code", "execution_count": 5, "id": "701dae54-3702-4b92-b1e0-4fd4f8aa0c54", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "?? a (1, 5) (5, 4)\n" ] }, { "data": { "text/plain": [ "pyalign.solve.Solution" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "solution = solver.solve(problem)\n", "type(solution)" ] }, { "cell_type": "markdown", "id": "ec3b4929-5005-4637-a980-2d4b35ef2144", "metadata": {}, "source": [ "Here is the optimal alignment associated with the obtained solution:" ] }, { "cell_type": "code", "execution_count": 6, "id": "0778ed0b-8572-4bf8-8abb-c413287224a2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t
AATCG
|| ||
AA CG
\n", "\t\t\t" ], "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "solution.alignment" ] }, { "cell_type": "markdown", "id": "64b08acf-d9cd-4a79-9b76-6a5720b4ae84", "metadata": {}, "source": [ "And here is the score that is achieved with this optimal alignment:" ] }, { "cell_type": "code", "execution_count": 7, "id": "3755e7ea-2cfa-40a0-af58-c1c378d08552", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "solution.score" ] }, { "cell_type": "markdown", "id": "772f76d4-6439-4989-bbfa-9f610a08d0f0", "metadata": {}, "source": [ "The `Solution` object contains various other data that allows us to reason about why this solution was obtained. By displaying it in a Jupyter notebook, a `Solution` object gives the obtained value matrix that has been annotated with traceback paths (see arrows between cells) and the actually obtained path (see shaded cells)." ] }, { "cell_type": "code", "execution_count": 8, "id": "896529da-7769-47be-b3c1-0e1b4121ec1f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "
\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"44b66690-7e77-4528-b080-beb69e96c4ec\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"above\":[{\"id\":\"1013\"}],\"center\":[{\"id\":\"1016\"},{\"id\":\"1020\"},{\"id\":\"1051\"},{\"id\":\"1056\"},{\"id\":\"1060\"}],\"height\":290,\"left\":[{\"id\":\"1017\"}],\"renderers\":[{\"id\":\"1038\"},{\"id\":\"1043\"},{\"id\":\"1047\"}],\"title\":null,\"toolbar\":{\"id\":\"1028\"},\"toolbar_location\":null,\"width\":250,\"x_range\":{\"id\":\"1005\"},\"x_scale\":{\"id\":\"1009\"},\"y_range\":{\"id\":\"1007\"},\"y_scale\":{\"id\":\"1011\"}},\"id\":\"1003\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1067\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"ticks\":[1,2,3,4]},\"id\":\"1062\",\"type\":\"FixedTicker\"},{\"attributes\":{\"data\":{\"height\":[1,1,1,1,1],\"width\":[1,1,1,1,1],\"x\":{\"__ndarray__\":\"BAADAAIAAgABAA==\",\"dtype\":\"int16\",\"order\":\"little\",\"shape\":[5]},\"y\":{\"__ndarray__\":\"BQAEAAMAAgABAA==\",\"dtype\":\"int16\",\"order\":\"little\",\"shape\":[5]}},\"selected\":{\"id\":\"1076\"},\"selection_policy\":{\"id\":\"1075\"}},\"id\":\"1045\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1073\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.25},\"fill_color\":{\"value\":\"orange\"},\"height\":{\"field\":\"height\"},\"line_color\":{\"value\":null},\"width\":{\"field\":\"width\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1046\",\"type\":\"Rect\"},{\"attributes\":{},\"id\":\"1009\",\"type\":\"LinearScale\"},{\"attributes\":{\"formatter\":{\"id\":\"1069\"},\"major_label_overrides\":{\"1\":\"A\",\"2\":\"A\",\"3\":\"T\",\"4\":\"C\",\"5\":\"G\"},\"major_label_policy\":{\"id\":\"1070\"},\"ticker\":{\"id\":\"1064\"}},\"id\":\"1017\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1068\",\"type\":\"AllLabels\"},{\"attributes\":{\"data_source\":{\"id\":\"1045\"},\"glyph\":{\"id\":\"1046\"},\"hover_glyph\":null,\"muted_glyph\":null,\"view\":{\"id\":\"1048\"}},\"id\":\"1047\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1074\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1078\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"end\":{\"id\":\"1050\"},\"line_color\":{\"value\":\"blue\"},\"source\":{\"id\":\"1049\"},\"start\":null,\"x_end\":{\"field\":\"x_end\"},\"x_start\":{\"field\":\"x_start\"},\"y_end\":{\"field\":\"y_end\"},\"y_start\":{\"field\":\"y_start\"}},\"id\":\"1051\",\"type\":\"Arrow\"},{\"attributes\":{\"source\":{\"id\":\"1040\"}},\"id\":\"1044\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1069\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"screen\"},\"id\":\"1027\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data\":{\"x_end\":{\"__ndarray__\":\"zwngel7e9j8zMzMzMzMDQDMzMzMzMwtAPieA63l52z8zMzMzMzMDQDMzMzMzMwtAAAAAAAAA8D8AAAAAAAAIQAAAAAAAABBAAAAAAAAA8D8AAAAAAAAAQDMzMzMzMwtAAAAAAAAA8D8AAAAAAAAAQAAAAAAAAAhA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[15]},\"x_start\":{\"__ndarray__\":\"MfYfhaEh+T/NzMzMzMwEQM3MzMzMzAxAYew/CkND4j/NzMzMzMwEQM3MzMzMzAxAAAAAAAAA8D8AAAAAAAAIQAAAAAAAABBAAAAAAAAA8D8AAAAAAAAAQM3MzMzMzAxAAAAAAAAA8D8AAAAAAAAAQAAAAAAAAAhA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[15]},\"y_end\":{\"__ndarray__\":\"PieA63l52z8AAAAAAADwPwAAAAAAAPA/zwngel7e9j8AAAAAAAAAQAAAAAAAAABAMzMzMzMzA0AzMzMzMzMDQDMzMzMzMwNAMzMzMzMzC0AzMzMzMzMLQAAAAAAAABBAmpmZmZmZEUCamZmZmZkRQJqZmZmZmRFA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[15]},\"y_start\":{\"__ndarray__\":\"Yew/CkND4j8AAAAAAADwPwAAAAAAAPA/MfYfhaEh+T8AAAAAAAAAQAAAAAAAAABAzczMzMzMBEDNzMzMzMwEQM3MzMzMzARAzczMzMzMDEDNzMzMzMwMQAAAAAAAABBAZmZmZmZmEkBmZmZmZmYSQGZmZmZmZhJA\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[15]}},\"selected\":{\"id\":\"1079\"},\"selection_policy\":{\"id\":\"1078\"}},\"id\":\"1049\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1079\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"1045\"}},\"id\":\"1048\",\"type\":\"CDSView\"},{\"attributes\":{\"formatter\":{\"id\":\"1067\"},\"major_label_overrides\":{\"1\":\"A\",\"2\":\"A\",\"3\":\"C\",\"4\":\"G\"},\"major_label_policy\":{\"id\":\"1068\"},\"ticker\":{\"id\":\"1062\"}},\"id\":\"1013\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1070\",\"type\":\"AllLabels\"},{\"attributes\":{\"line_alpha\":{\"value\":0.5},\"line_color\":{\"value\":\"blue\"},\"size\":{\"value\":5}},\"id\":\"1050\",\"type\":\"OpenHead\"},{\"attributes\":{},\"id\":\"1080\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1026\",\"type\":\"HelpTool\"},{\"attributes\":{\"end\":{\"id\":\"1055\"},\"line_color\":{\"value\":\"orange\"},\"line_width\":{\"value\":2},\"source\":{\"id\":\"1054\"},\"start\":null,\"x_end\":{\"field\":\"x_end\"},\"x_start\":{\"field\":\"x_start\"},\"y_end\":{\"field\":\"y_end\"},\"y_start\":{\"field\":\"y_start\"}},\"id\":\"1056\",\"type\":\"Arrow\"},{\"attributes\":{},\"id\":\"1081\",\"type\":\"Selection\"},{\"attributes\":{\"line_color\":{\"value\":\"orange\"},\"size\":{\"value\":5}},\"id\":\"1055\",\"type\":\"OpenHead\"},{\"attributes\":{},\"id\":\"1075\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1005\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data\":{\"x_end\":{\"__ndarray__\":\"AAAAel7eCkAAAAB6Xt4CQAAAAAAAAABAAAAA9Ly89T8=\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[4]},\"x_start\":{\"__ndarray__\":\"AAAAhqEhDUAAAACGoSEFQAAAAAAAAABAAAAADEND+j8=\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[4]},\"y_end\":{\"__ndarray__\":\"AAAAPS9vEUAAAAB6Xt4KQAAAAGZmZgJAAAAA9Ly89T8=\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[4]},\"y_start\":{\"__ndarray__\":\"AAAAw9CQEkAAAACGoSENQAAAAJqZmQVAAAAADEND+j8=\",\"dtype\":\"float64\",\"order\":\"little\",\"shape\":[4]}},\"selected\":{\"id\":\"1081\"},\"selection_policy\":{\"id\":\"1080\"}},\"id\":\"1054\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1076\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1082\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis\":{\"id\":\"1017\"},\"dimension\":1,\"grid_line_color\":null,\"ticker\":null},\"id\":\"1020\",\"type\":\"Grid\"},{\"attributes\":{\"flipped\":true},\"id\":\"1007\",\"type\":\"DataRange1d\"},{\"attributes\":{\"data_source\":{\"id\":\"1035\"},\"glyph\":{\"id\":\"1036\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1037\"},\"view\":{\"id\":\"1039\"}},\"id\":\"1038\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1083\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{\"color\":[\"gray\",\"gray\",\"gray\",\"gray\",\"gray\",\"gray\",\"black\",\"black\",\"black\",\"black\",\"gray\",\"black\",\"black\",\"black\",\"black\",\"gray\",\"black\",\"black\",\"black\",\"black\",\"gray\",\"black\",\"black\",\"black\",\"black\",\"gray\",\"black\",\"black\",\"black\",\"black\"],\"value\":[\"0.0\",\"0.0\",\"0.0\",\"0.0\",\"0.0\",\"0.0\",\"1.0\",\"1.0\",\"1.0\",\"1.0\",\"0.0\",\"1.0\",\"2.0\",\"2.0\",\"2.0\",\"0.0\",\"1.0\",\"2.0\",\"2.0\",\"2.0\",\"0.0\",\"1.0\",\"2.0\",\"3.0\",\"3.0\",\"0.0\",\"1.0\",\"2.0\",\"3.0\",\"4.0\"],\"x\":[0,1,2,3,4,0,1,2,3,4,0,1,2,3,4,0,1,2,3,4,0,1,2,3,4,0,1,2,3,4],\"y\":[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5]},\"selected\":{\"id\":\"1083\"},\"selection_policy\":{\"id\":\"1082\"}},\"id\":\"1059\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"active_multi\":null,\"tools\":[{\"id\":\"1021\"},{\"id\":\"1022\"},{\"id\":\"1023\"},{\"id\":\"1024\"},{\"id\":\"1025\"},{\"id\":\"1026\"}]},\"id\":\"1028\",\"type\":\"Toolbar\"},{\"attributes\":{\"ticks\":[1,2,3,4,5]},\"id\":\"1064\",\"type\":\"FixedTicker\"},{\"attributes\":{},\"id\":\"1022\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1021\",\"type\":\"PanTool\"},{\"attributes\":{\"axis\":{\"id\":\"1013\"},\"grid_line_color\":null,\"ticker\":null},\"id\":\"1016\",\"type\":\"Grid\"},{\"attributes\":{\"overlay\":{\"id\":\"1027\"}},\"id\":\"1023\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1071\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1024\",\"type\":\"SaveTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1040\"},\"glyph\":{\"id\":\"1041\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1042\"},\"view\":{\"id\":\"1044\"}},\"id\":\"1043\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1072\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1025\",\"type\":\"ResetTool\"},{\"attributes\":{\"data\":{\"xs\":[[0.5,0.5],[1.5,1.5],[2.5,2.5],[3.5,3.5],[4.5,4.5]],\"ys\":[[-0.5,5.5],[-0.5,5.5],[-0.5,5.5],[-0.5,5.5],[-0.5,5.5]]},\"selected\":{\"id\":\"1074\"},\"selection_policy\":{\"id\":\"1073\"}},\"id\":\"1040\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"line_color\":{\"value\":\"lightgray\"},\"line_width\":{\"value\":2},\"xs\":{\"field\":\"xs\"},\"ys\":{\"field\":\"ys\"}},\"id\":\"1036\",\"type\":\"MultiLine\"},{\"attributes\":{\"data\":{\"xs\":[[-0.5,4.5],[-0.5,4.5],[-0.5,4.5],[-0.5,4.5],[-0.5,4.5],[-0.5,4.5]],\"ys\":[[0.5,0.5],[1.5,1.5],[2.5,2.5],[3.5,3.5],[4.5,4.5],[5.5,5.5]]},\"selected\":{\"id\":\"1072\"},\"selection_policy\":{\"id\":\"1071\"}},\"id\":\"1035\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"1035\"}},\"id\":\"1039\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"lightgray\"},\"line_width\":{\"value\":2},\"xs\":{\"field\":\"xs\"},\"ys\":{\"field\":\"ys\"}},\"id\":\"1037\",\"type\":\"MultiLine\"},{\"attributes\":{\"line_color\":{\"value\":\"lightgray\"},\"line_width\":{\"value\":2},\"xs\":{\"field\":\"xs\"},\"ys\":{\"field\":\"ys\"}},\"id\":\"1041\",\"type\":\"MultiLine\"},{\"attributes\":{\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"lightgray\"},\"line_width\":{\"value\":2},\"xs\":{\"field\":\"xs\"},\"ys\":{\"field\":\"ys\"}},\"id\":\"1042\",\"type\":\"MultiLine\"},{\"attributes\":{\"source\":{\"id\":\"1059\"},\"text\":{\"field\":\"value\"},\"text_align\":{\"value\":\"center\"},\"text_baseline\":{\"value\":\"middle\"},\"text_color\":{\"field\":\"color\"},\"text_font_size\":{\"value\":\"9pt\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1060\",\"type\":\"LabelSet\"},{\"attributes\":{},\"id\":\"1011\",\"type\":\"LinearScale\"}],\"root_ids\":[\"1003\"]},\"title\":\"Bokeh Application\",\"version\":\"2.3.2\"}};\n", " var render_items = [{\"docid\":\"44b66690-7e77-4528-b080-beb69e96c4ec\",\"root_ids\":[\"1003\"],\"roots\":{\"1003\":\"eaec2edf-e707-40ed-8e14-64f9e1412ead\"}}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " clearInterval(timer);\n", " embed_document(root);\n", " } else {\n", " attempts++;\n", " if (attempts > 100) {\n", " clearInterval(timer);\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", " }\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "1003" } }, "output_type": "display_data" } ], "source": [ "solution" ] }, { "cell_type": "markdown", "id": "3527471a-cdd9-4ae9-9f46-81b8a642aab8", "metadata": {}, "source": [ "# Problems and Alphabets" ] }, { "cell_type": "markdown", "id": "b0d1e25f-986e-4170-8a6b-80215a6946b5", "metadata": {}, "source": [ "### Arbitrary Alphabets" ] }, { "cell_type": "markdown", "id": "9e1caf26-b24c-4e43-a3e6-39c984891ea1", "metadata": {}, "source": [ "One important feature of `pyalign` is that it does *not* rely on letters or characters as elements of the sequences that are going to be aligned, but works with literally any (comparable) Python object. The latter only needs to know about equality and hash values - i.e. it needs to have `__eq__` and `__hash__` methods.\n", "\n", "This means that sequences are not restricted to consist of elements that are represented in a fixed encoding with a very limited maximum size, such as 7-bit ASCII.\n", "\n", "The following section illustrates this by generating sequences of graphical shape objects that we then align to each other. We use the `shapyter` utility library to create shapes. Each shape you see in this section is a graphical representation of an `shapyter.Shape` instance and its attributes - namely form and color.\n", "\n", "`shapyter` allows us to create shape instances that are rendered in Jupyter:" ] }, { "cell_type": "code", "execution_count": 9, "id": "4f89410e-1a2e-4587-a811-b3ec376ab4ab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " " ], "text/plain": [ "Triangle(\"white\")" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#import importlib\n", "#importlib.reload(shapyter)\n", "\n", "import shapyter\n", "shapyter.Triangle()" ] }, { "cell_type": "markdown", "id": "b146d29e-1d71-4f08-afe3-5bddc83b2598", "metadata": {}, "source": [ "Shapes are differentiated by their form and color:" ] }, { "cell_type": "code", "execution_count": 10, "id": "f64a7ed1-836c-4767-af52-f349d0bbb8c1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "[\n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " , \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " ]" ], "text/plain": [ "[Triangle(\"yellow\"), Circle(\"#F010F0\")]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shapyter.List([shapyter.Triangle(\"yellow\"), shapyter.Circle(\"#F010F0\")])" ] }, { "cell_type": "markdown", "id": "5ca43717-4a98-4859-b935-461f084d2630", "metadata": {}, "source": [ "`shapyter` also comes with `Shapifier` that maps letters to shapes. As stated above, our goal here is to stop working with letters and instead use general object instances. The mapping we obtain from one `Shapifier` is consistent, e.g. \"A\" is always mapped to a reddish circle." ] }, { "cell_type": "code", "execution_count": 11, "id": "da592d3a-c960-43ba-a671-d6f7cfe13bbb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "[\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " , \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " , \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " ]" ], "text/plain": [ "[Circle(\"#F8C3B6\"), Square(\"#F8C3B6\"), Square(\"#F8C3B6\")]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shapifier = shapyter.Shapifier(30)\n", "shapifier(\"AGG\")" ] }, { "cell_type": "markdown", "id": "d50292a9-6e54-4471-98f3-8b53bf24b848", "metadata": {}, "source": [ "We can map arbitrary letter sequences to shapes consistently:" ] }, { "cell_type": "code", "execution_count": 12, "id": "e8674316-a73a-4edc-8a1c-aff1058afda6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "[\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " , \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " , \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " , \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " , \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " ]" ], "text/plain": [ "[Circle(\"#F8C3B6\"),\n", " Circle(\"#F8C3B6\"),\n", " Circle(\"#AEBC6E\"),\n", " Triangle(\"#86AED1\"),\n", " Square(\"#F8C3B6\")]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shapifier(\"AATCG\")" ] }, { "cell_type": "code", "execution_count": 13, "id": "460008ad-640c-4b75-832a-3525b06ebfca", "metadata": {}, "outputs": [ { "data": { "text/html": [ "[\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " , \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " , \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " , \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " ]" ], "text/plain": [ "[Circle(\"#F8C3B6\"), Circle(\"#F8C3B6\"), Triangle(\"#86AED1\"), Square(\"#F8C3B6\")]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shapifier(\"AACG\")" ] }, { "cell_type": "markdown", "id": "42c85567-c1ad-40b3-9430-4b3f73be034e", "metadata": {}, "source": [ "Remember that - independent of the rendering and the animation effects above - we are dealing with lists of generic object instances on a Python level. The list of shapes above therefore is really the following list of (comparable) instances:" ] }, { "cell_type": "code", "execution_count": 14, "id": "479824a1-cec0-4c41-bd15-8dfe0329159f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[Circle(\"#F8C3B6\"), Circle(\"#F8C3B6\"), Triangle(\"#86AED1\"), Square(\"#F8C3B6\")]'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "repr(shapifier(\"AACG\"))" ] }, { "cell_type": "markdown", "id": "4084a32f-d149-44b1-b576-41a93119d568", "metadata": {}, "source": [ "We are now ready to demonstrate that `pyalign` really does not care about the element type we pass it. Instead of strings - i.e. sequences of letters - we pass in lists of `Shape` objects." ] }, { "cell_type": "code", "execution_count": 15, "id": "828e8dad-47f9-4369-898b-c41ac66bf89d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", "
|| ||
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", " \n", "
\n", "\t\t\t" ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shape_problem_set = pyalign.problems.alphabetic(\n", " shapifier(\"AGCT\"),\n", " lambda x, y: 1 if x == y else -1,\n", " direction=\"maximize\")\n", "\n", "shape_problem = shape_problem_set.new_problem(\n", " shapifier(\"AATCG\"), shapifier(\"AACG\"))\n", "\n", "solver.solve(shape_problem).alignment" ] }, { "cell_type": "markdown", "id": "c3bc41cf-a8a5-4b0f-aa2a-2a700481cec2", "metadata": {}, "source": [ "### Large Alphabets" ] }, { "cell_type": "code", "execution_count": 23, "id": "15328ded-d693-48ea-8a13-d7c088f08043", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import shapyter\n", "import importlib\n", "importlib.reload(shapyter)" ] }, { "cell_type": "markdown", "id": "a05c6b6c-7802-4e90-95ff-56b7a9017cc9", "metadata": {}, "source": [ "In some use cases, the size of the alphabet is too large to specify. For these cases `pyalign` offers a different way to formulate problems without specifying an explicit alphabet.\n", "\n", "For example, let us consider alignments over sequences whose elements are sets $S_1, ..., S_k$ and the similarity measure between two element sets is the Jaccard similarity. If the sets are drawn from $n$ elements, i.e. $S_i \\in 2^{\\{1, ..., n\\}}$, then the alphabet we draw from has $2^ n$ elements, which is infeasible to specify for larger $n$. For these cases, we use `pyalign.problems.general` to specify the problem without an explicit alphabet." ] }, { "cell_type": "markdown", "id": "045b5c73-4006-41be-aa0f-1cdb9b660cae", "metadata": {}, "source": [ "\n", "\n", "
If you can specify a small alphabet, problems created through `alphabetic` are solved faster than those specified through `general`. Therefore always check that your problem ist not suitable for `alphabetic` first.
" ] }, { "cell_type": "markdown", "id": "c26fe1be-1053-47ca-836b-5cdc73305057", "metadata": {}, "source": [ "Let us illustrate the example from above with $n=9$. We visualize sets as a matrix of of indices, where black squares indicate that the $i$-th element is in the set." ] }, { "cell_type": "code", "execution_count": 24, "id": "ad7ece26-dad3-47cb-b6d1-46aabd60ecfe", "metadata": {}, "outputs": [ { "data": { "text/html": [ "[\n", " \n", " \n", " \n", " , \n", " \n", " \n", " \n", " , \n", " \n", " \n", " \n", " , \n", " \n", " \n", " \n", " ]" ], "text/plain": [ "[,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sets = shapyter.List([\n", " shapyter.Set(100, 9),\n", " shapyter.Set(101, 9),\n", " shapyter.Set(50, 9),\n", " shapyter.Set(211, 9)\n", "])\n", "sets" ] }, { "cell_type": "markdown", "id": "51f8a445-768d-4906-8685-069dae1c7f01", "metadata": {}, "source": [ "We can compute the Jaccard similarity between these sets:" ] }, { "cell_type": "code", "execution_count": 25, "id": "01ba7a26-2b14-4979-91e4-383b7787f482", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.3333333333333333" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sets[0].jaccard(sets[1])" ] }, { "cell_type": "code", "execution_count": 26, "id": "cf5c644a-0023-4535-be8d-5ad1e82071f4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.1111111111111111" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sets[1].jaccard(sets[2])" ] }, { "cell_type": "markdown", "id": "7f1a2ad6-7480-4708-be35-1e3cc964ec91", "metadata": {}, "source": [ "Using these sets as elements, we can now use `general` to pose an alignment problem without an alphabet:" ] }, { "cell_type": "code", "execution_count": 27, "id": "6559ec24-22c9-40f9-8f54-593cfb6915ed", "metadata": {}, "outputs": [], "source": [ "jaccard_problem_set = pyalign.problems.general(\n", " lambda x, y: x.jaccard(y),\n", " direction=\"maximize\")" ] }, { "cell_type": "code", "execution_count": 28, "id": "35f9c5d3-55e3-4c6a-aab2-a2c4b6a00dfd", "metadata": {}, "outputs": [], "source": [ "jaccard_problem = jaccard_problem_set.new_problem(\n", " [sets[i] for i in (0, 2, 1, 0, 2)],\n", " [sets[i] for i in (1, 1, 2, 2)],\n", ")" ] }, { "cell_type": "code", "execution_count": 29, "id": "c665973c-8bc0-4fed-880c-0917120e6a92", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
| |||
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "\t\t\t" ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "solver.solve(jaccard_problem).alignment" ] }, { "cell_type": "markdown", "id": "da639f90-2cde-491e-98aa-a9aa5bc74751", "metadata": {}, "source": [ "# Gaps and Locality" ] }, { "cell_type": "markdown", "id": "cd32ba75-32a3-4be3-bbd8-24a4d75be75b", "metadata": {}, "source": [ "### Gaps and Gap Costs" ] }, { "cell_type": "markdown", "id": "a1b92025-b50c-4295-8422-60cb8d037af8", "metadata": {}, "source": [ "`pyalign` offers various configurable standard models of gap costs as well as custom gap costs driven by user functions.\n", "\n", "\n", "\n", "
\n", "`pyalign` chooses algorithms automatically. You just pass the desired gap cost function to the `Solver` and `pyalign` will internally pick the optimal implemented algorithm to use. For example, if you pick affine gap costs, you will get Gotoh's algorithm and O(n^2) runtime. If you configure logarithmic gap cost, `pyalign` will internally use a O(n^3) solver.
" ] }, { "cell_type": "code", "execution_count": 18, "id": "aa4c2ed5-ca8f-440d-a2df-e566afac9ade", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t
AAT CG
||
AACG
\n", "\t\t\t" ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pyalign.solve\n", "import pyalign.gaps\n", "\n", "solver_g = pyalign.solve.LocalSolver(\n", " gap_cost=pyalign.gaps.AffineGapCost(1, 2))\n", "\n", "solver_g.solve(problem).alignment" ] }, { "cell_type": "markdown", "id": "aca2a913-52b3-48cf-bb60-583495ba58a4", "metadata": {}, "source": [ "Here is a plot of the configured affine gap cost in terms of gap length:" ] }, { "cell_type": "code", "execution_count": 19, "id": "00fe8618-658a-4f4a-9c91-fe23e9392f3c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "
\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"0387587c-8488-4168-937f-b059993d85cc\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1305\"}],\"center\":[{\"id\":\"1308\"},{\"id\":\"1312\"}],\"height\":200,\"left\":[{\"id\":\"1309\"}],\"renderers\":[{\"id\":\"1330\"},{\"id\":\"1338\"}],\"title\":\"w(k) = 1.0 + 2.0 * k, w(0) = 0\",\"toolbar\":{\"id\":\"1320\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1297\"},\"x_scale\":{\"id\":\"1301\"},\"y_range\":{\"id\":\"1299\"},\"y_scale\":{\"id\":\"1303\"}},\"id\":\"1296\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"angle\":{\"field\":\"angle\"},\"line_dash\":{\"value\":\"dotted\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1336\",\"type\":\"Ray\"},{\"attributes\":{\"data_source\":{\"id\":\"1327\"},\"glyph\":{\"id\":\"1328\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1329\"},\"view\":{\"id\":\"1331\"}},\"id\":\"1330\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1371\",\"type\":\"AllLabels\"},{\"attributes\":{},\"id\":\"1373\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1297\",\"type\":\"DataRange1d\"},{\"attributes\":{\"ticks\":[0,1,2,3,4]},\"id\":\"1333\",\"type\":\"FixedTicker\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"screen\"},\"id\":\"1319\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"active_multi\":null,\"tools\":[{\"id\":\"1313\"},{\"id\":\"1314\"},{\"id\":\"1315\"},{\"id\":\"1316\"},{\"id\":\"1317\"},{\"id\":\"1318\"}]},\"id\":\"1320\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1310\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis\":{\"id\":\"1309\"},\"dimension\":1,\"ticker\":null},\"id\":\"1312\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1303\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1301\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1318\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1377\",\"type\":\"Selection\"},{\"attributes\":{\"data\":{\"angle\":[0],\"x\":[0],\"y\":[1]},\"selected\":{\"id\":\"1377\"},\"selection_policy\":{\"id\":\"1378\"}},\"id\":\"1335\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1378\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis_label\":\"cost\",\"formatter\":{\"id\":\"1373\"},\"major_label_policy\":{\"id\":\"1374\"},\"ticker\":{\"id\":\"1310\"}},\"id\":\"1309\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis_label\":\"gap length\",\"formatter\":{\"id\":\"1370\"},\"major_label_policy\":{\"id\":\"1371\"},\"ticker\":{\"id\":\"1333\"}},\"id\":\"1305\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1374\",\"type\":\"AllLabels\"},{\"attributes\":{\"data\":{\"x\":[0,1,2,3,4],\"y\":{\"__ndarray__\":\"AAAAAAAAQEAAAKBAAADgQAAAEEE=\",\"dtype\":\"float32\",\"order\":\"little\",\"shape\":[5]}},\"selected\":{\"id\":\"1375\"},\"selection_policy\":{\"id\":\"1376\"}},\"id\":\"1327\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1314\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1313\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1316\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1317\",\"type\":\"ResetTool\"},{\"attributes\":{\"overlay\":{\"id\":\"1319\"}},\"id\":\"1315\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1299\",\"type\":\"DataRange1d\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1328\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1370\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"source\":{\"id\":\"1327\"}},\"id\":\"1331\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1329\",\"type\":\"Line\"},{\"attributes\":{\"axis\":{\"id\":\"1305\"},\"ticker\":null},\"id\":\"1308\",\"type\":\"Grid\"},{\"attributes\":{\"data_source\":{\"id\":\"1335\"},\"glyph\":{\"id\":\"1336\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1337\"},\"view\":{\"id\":\"1339\"}},\"id\":\"1338\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1375\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1376\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"angle\":{\"field\":\"angle\"},\"line_alpha\":{\"value\":0.1},\"line_dash\":{\"value\":\"dotted\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1337\",\"type\":\"Ray\"},{\"attributes\":{\"source\":{\"id\":\"1335\"}},\"id\":\"1339\",\"type\":\"CDSView\"}],\"root_ids\":[\"1296\"]},\"title\":\"Bokeh Application\",\"version\":\"2.3.2\"}};\n", " var render_items = [{\"docid\":\"0387587c-8488-4168-937f-b059993d85cc\",\"root_ids\":[\"1296\"],\"roots\":{\"1296\":\"7a0105d5-33e7-4372-9e8f-93adba0b6f66\"}}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " clearInterval(timer);\n", " embed_document(root);\n", " } else {\n", " attempts++;\n", " if (attempts > 100) {\n", " clearInterval(timer);\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", " }\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "1296" } }, "output_type": "display_data" } ], "source": [ "solver_g.gap_cost" ] }, { "cell_type": "markdown", "id": "dc0d592c-2f5e-4cb5-a80f-b8daecf9af67", "metadata": {}, "source": [ "Here we configure a constant gap cost independent of length $k$, given $k \\ge 1$:" ] }, { "cell_type": "code", "execution_count": 20, "id": "1927d2d5-6b0b-403d-9673-82f674c43a70", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "
\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"5a170a59-a7b8-4486-b9e4-4cb07e17a91f\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1444\"}],\"center\":[{\"id\":\"1447\"},{\"id\":\"1451\"}],\"height\":200,\"left\":[{\"id\":\"1448\"}],\"renderers\":[{\"id\":\"1469\"},{\"id\":\"1477\"}],\"title\":\"w(k) = 0.3, w(0) = 0\",\"toolbar\":{\"id\":\"1459\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1436\"},\"x_scale\":{\"id\":\"1440\"},\"y_range\":{\"id\":\"1438\"},\"y_scale\":{\"id\":\"1442\"}},\"id\":\"1435\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"end\":1},\"id\":\"1438\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1440\",\"type\":\"LinearScale\"},{\"attributes\":{\"data_source\":{\"id\":\"1466\"},\"glyph\":{\"id\":\"1467\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1468\"},\"view\":{\"id\":\"1470\"}},\"id\":\"1469\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"gap length\",\"formatter\":{\"id\":\"1520\"},\"major_label_policy\":{\"id\":\"1521\"},\"ticker\":{\"id\":\"1472\"}},\"id\":\"1444\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1436\",\"type\":\"DataRange1d\"},{\"attributes\":{\"axis_label\":\"cost\",\"formatter\":{\"id\":\"1523\"},\"major_label_policy\":{\"id\":\"1524\"},\"ticker\":{\"id\":\"1449\"}},\"id\":\"1448\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1457\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1442\",\"type\":\"LinearScale\"},{\"attributes\":{\"axis\":{\"id\":\"1444\"},\"ticker\":null},\"id\":\"1447\",\"type\":\"Grid\"},{\"attributes\":{\"angle\":{\"field\":\"angle\"},\"line_alpha\":{\"value\":0.1},\"line_dash\":{\"value\":\"dotted\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1476\",\"type\":\"Ray\"},{\"attributes\":{},\"id\":\"1525\",\"type\":\"Selection\"},{\"attributes\":{\"axis\":{\"id\":\"1448\"},\"dimension\":1,\"ticker\":null},\"id\":\"1451\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1526\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1449\",\"type\":\"BasicTicker\"},{\"attributes\":{\"data\":{\"x\":[0,1,2,3,4],\"y\":{\"__ndarray__\":\"AAAAAJqZmT6amZk+mpmZPpqZmT4=\",\"dtype\":\"float32\",\"order\":\"little\",\"shape\":[5]}},\"selected\":{\"id\":\"1525\"},\"selection_policy\":{\"id\":\"1526\"}},\"id\":\"1466\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"active_multi\":null,\"tools\":[{\"id\":\"1452\"},{\"id\":\"1453\"},{\"id\":\"1454\"},{\"id\":\"1455\"},{\"id\":\"1456\"},{\"id\":\"1457\"}]},\"id\":\"1459\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1453\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1452\",\"type\":\"PanTool\"},{\"attributes\":{\"overlay\":{\"id\":\"1458\"}},\"id\":\"1454\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1455\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1456\",\"type\":\"ResetTool\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1467\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1527\",\"type\":\"Selection\"},{\"attributes\":{\"source\":{\"id\":\"1466\"}},\"id\":\"1470\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1468\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1528\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1523\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1521\",\"type\":\"AllLabels\"},{\"attributes\":{\"angle\":{\"field\":\"angle\"},\"line_dash\":{\"value\":\"dotted\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1475\",\"type\":\"Ray\"},{\"attributes\":{\"source\":{\"id\":\"1474\"}},\"id\":\"1478\",\"type\":\"CDSView\"},{\"attributes\":{\"data_source\":{\"id\":\"1474\"},\"glyph\":{\"id\":\"1475\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1476\"},\"view\":{\"id\":\"1478\"}},\"id\":\"1477\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1524\",\"type\":\"AllLabels\"},{\"attributes\":{\"data\":{\"angle\":[0],\"x\":[0],\"y\":[1]},\"selected\":{\"id\":\"1527\"},\"selection_policy\":{\"id\":\"1528\"}},\"id\":\"1474\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"ticks\":[0,1,2,3,4]},\"id\":\"1472\",\"type\":\"FixedTicker\"},{\"attributes\":{},\"id\":\"1520\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"screen\"},\"id\":\"1458\",\"type\":\"BoxAnnotation\"}],\"root_ids\":[\"1435\"]},\"title\":\"Bokeh Application\",\"version\":\"2.3.2\"}};\n", " var render_items = [{\"docid\":\"5a170a59-a7b8-4486-b9e4-4cb07e17a91f\",\"root_ids\":[\"1435\"],\"roots\":{\"1435\":\"32c10d3f-ad89-402c-96e3-ea2023912656\"}}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " clearInterval(timer);\n", " embed_document(root);\n", " } else {\n", " attempts++;\n", " if (attempts > 100) {\n", " clearInterval(timer);\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", " }\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "1435" } }, "output_type": "display_data" } ], "source": [ "pyalign.gaps.ConstantGapCost(0.3)" ] }, { "cell_type": "markdown", "id": "2f5d5af7-628a-4d50-b735-f84b1604d3b1", "metadata": {}, "source": [ "As a final example here is how to configure a logarithmic gap cost:" ] }, { "cell_type": "code", "execution_count": 21, "id": "6fb7d917-18fb-41f4-9049-a6fc183e1eac", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "
\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"69e25620-25ee-4c4d-a5c0-8128da4b26f2\":{\"defs\":[],\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1594\"}],\"center\":[{\"id\":\"1597\"},{\"id\":\"1601\"}],\"height\":200,\"left\":[{\"id\":\"1598\"}],\"renderers\":[{\"id\":\"1619\"},{\"id\":\"1627\"}],\"title\":\"w(k) = 0.1 + 0.3 * ln(k), w(0) = 0\",\"toolbar\":{\"id\":\"1609\"},\"toolbar_location\":null,\"x_range\":{\"id\":\"1586\"},\"x_scale\":{\"id\":\"1590\"},\"y_range\":{\"id\":\"1588\"},\"y_scale\":{\"id\":\"1592\"}},\"id\":\"1585\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"overlay\":{\"id\":\"1608\"}},\"id\":\"1604\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1605\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1606\",\"type\":\"ResetTool\"},{\"attributes\":{\"ticks\":[0,1,2,3,4]},\"id\":\"1622\",\"type\":\"FixedTicker\"},{\"attributes\":{},\"id\":\"1685\",\"type\":\"AllLabels\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1617\",\"type\":\"Line\"},{\"attributes\":{\"source\":{\"id\":\"1616\"}},\"id\":\"1620\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1618\",\"type\":\"Line\"},{\"attributes\":{\"angle\":{\"field\":\"angle\"},\"line_dash\":{\"value\":\"dotted\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1625\",\"type\":\"Ray\"},{\"attributes\":{},\"id\":\"1687\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis\":{\"id\":\"1598\"},\"dimension\":1,\"ticker\":null},\"id\":\"1601\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1681\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":0.5,\"fill_color\":\"lightgrey\",\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":1.0,\"line_color\":\"black\",\"line_dash\":[4,4],\"line_width\":2,\"right_units\":\"screen\",\"syncable\":false,\"top_units\":\"screen\"},\"id\":\"1608\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"data_source\":{\"id\":\"1616\"},\"glyph\":{\"id\":\"1617\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1618\"},\"view\":{\"id\":\"1620\"}},\"id\":\"1619\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1586\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1682\",\"type\":\"AllLabels\"},{\"attributes\":{},\"id\":\"1603\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1686\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1688\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1590\",\"type\":\"LinearScale\"},{\"attributes\":{\"data_source\":{\"id\":\"1624\"},\"glyph\":{\"id\":\"1625\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1626\"},\"view\":{\"id\":\"1628\"}},\"id\":\"1627\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1689\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"end\":1},\"id\":\"1588\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1592\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1607\",\"type\":\"HelpTool\"},{\"attributes\":{\"axis_label\":\"cost\",\"formatter\":{\"id\":\"1684\"},\"major_label_policy\":{\"id\":\"1685\"},\"ticker\":{\"id\":\"1599\"}},\"id\":\"1598\",\"type\":\"LinearAxis\"},{\"attributes\":{\"axis_label\":\"gap length\",\"formatter\":{\"id\":\"1681\"},\"major_label_policy\":{\"id\":\"1682\"},\"ticker\":{\"id\":\"1622\"}},\"id\":\"1594\",\"type\":\"LinearAxis\"},{\"attributes\":{\"angle\":{\"field\":\"angle\"},\"line_alpha\":{\"value\":0.1},\"line_dash\":{\"value\":\"dotted\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1626\",\"type\":\"Ray\"},{\"attributes\":{\"axis\":{\"id\":\"1594\"},\"ticker\":null},\"id\":\"1597\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1624\"}},\"id\":\"1628\",\"type\":\"CDSView\"},{\"attributes\":{\"data\":{\"angle\":[0],\"x\":[0],\"y\":[1]},\"selected\":{\"id\":\"1688\"},\"selection_policy\":{\"id\":\"1689\"}},\"id\":\"1624\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"active_multi\":null,\"tools\":[{\"id\":\"1602\"},{\"id\":\"1603\"},{\"id\":\"1604\"},{\"id\":\"1605\"},{\"id\":\"1606\"},{\"id\":\"1607\"}]},\"id\":\"1609\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1599\",\"type\":\"BasicTicker\"},{\"attributes\":{\"data\":{\"x\":[0,1,2,3,4],\"y\":{\"__ndarray__\":\"AAAAAM3MzD3cqp0+ZfLbPkIRBD8=\",\"dtype\":\"float32\",\"order\":\"little\",\"shape\":[5]}},\"selected\":{\"id\":\"1686\"},\"selection_policy\":{\"id\":\"1687\"}},\"id\":\"1616\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1684\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1602\",\"type\":\"PanTool\"}],\"root_ids\":[\"1585\"]},\"title\":\"Bokeh Application\",\"version\":\"2.3.2\"}};\n", " var render_items = [{\"docid\":\"69e25620-25ee-4c4d-a5c0-8128da4b26f2\",\"root_ids\":[\"1585\"],\"roots\":{\"1585\":\"10dfbdeb-e3ed-4acc-befb-0a8aefb97dd8\"}}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " clearInterval(timer);\n", " embed_document(root);\n", " } else {\n", " attempts++;\n", " if (attempts > 100) {\n", " clearInterval(timer);\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", " }\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "1585" } }, "output_type": "display_data" } ], "source": [ "pyalign.gaps.LogarithmicGapCost(0.1, 0.3)" ] }, { "cell_type": "markdown", "id": "324690f3-9703-4a5f-a6ff-60a614e068a9", "metadata": {}, "source": [ "### Local, Global and Semiglobal Solvers" ] }, { "cell_type": "code", "execution_count": 9, "id": "3f316e3b-4aab-484a-a126-927dfcc37a54", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cca84177c7fa46c0bb83cae2ee8b237e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "GridBox(children=(Label(value='global'), HTML(value='\\n\\t\\t\\t 0:\n", " items.append(widgets.Label(\"\"))\n", " items.append(widgets.HTML(solution.alignment._repr_html_()))\n", " \n", " return widgets.GridBox(\n", " items, layout=widgets.Layout(grid_template_columns=\"repeat(2, 200px)\"))\n", " \n", "render_solvers(problem_set.new_problem(\n", " \"AATCG\",\n", " \"AACG\"\n", "))" ] }, { "cell_type": "markdown", "id": "16b62a4d-a78c-49f8-afdf-07d1367a3224", "metadata": {}, "source": [ "# Speeding up Computations" ] }, { "cell_type": "markdown", "id": "90286acb-5335-4110-8964-528de35d469e", "metadata": {}, "source": [ "### Codomain, Solutions, Alignments and Scores\n", "> \"only compute what you want to know\"" ] }, { "cell_type": "markdown", "id": "e9e51f47-b272-4b91-8454-1716892a0b20", "metadata": {}, "source": [ "In terms of the constants involved in a $O(n^2)$ or $O(n^3)$ runtime for solving an alignment problem, computing full `Solution` objects - consisting of the value matrix, traceback information and so on - is more expensive than only computing a single score - even though both share the same quadratic or cubic runtime in $n$. If you solve many problems with small $n$ these these two tasks can show a considerable difference in runtime.\n", "\n", "For common use cases, where you do not actually use the full solution data (e.g. the full traceback matrix), but only the alignment or even only the score, `pyalign` offers you to restrict the data type you want the `Solver` to return as result. This type is called *codomain* in `pyalign`, since it defines the codomain of the `Solver`'s `solve()` method.\n", "\n", "
\n", "
pyalign will internally create an optimized solver that calls optimized C++ code for the specific codomain you request. For example, if you request to compute only scores, pyalign will not compute any traceback information in the first place.
\n", "\n", "Starting with a generic `LocalSolver`, let us create a new `Solver` that will not generate full `Solution` objects, but instead return `Alignment` instances. We can achieve this by calling `to_codomain` with the desired target codomain." ] }, { "cell_type": "code", "execution_count": 9, "id": "21862ed5-5ba5-4030-8b38-6f342ca15930", "metadata": {}, "outputs": [], "source": [ "solver_1 = pyalign.solve.LocalSolver(\n", " gap_cost=pyalign.gaps.LinearGapCost(2))" ] }, { "cell_type": "code", "execution_count": 10, "id": "8e1c7ce5-98fe-4dae-a9e2-5da19cbb0651", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t\n", "\t\t\t
AAT CG
||
AACG
\n", "\t\t\t" ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "solver_2 = solver_1.to_codomain(pyalign.solve.Alignment)\n", "solver_2.solve(problem)" ] }, { "cell_type": "markdown", "id": "90a2738c-a951-41cc-bc57-60f93dc5b252", "metadata": {}, "source": [ "Similarly, we can create a `Solver` that only returns scores." ] }, { "cell_type": "code", "execution_count": 11, "id": "f818517e-6b96-4494-b944-b1215d23f89d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "solver_3 = solver_1.to_codomain(pyalign.solve.Score)\n", "solver_3.solve(problem)" ] }, { "cell_type": "markdown", "id": "50871ca2-9678-4966-9188-2928e5d2012a", "metadata": {}, "source": [ "To illustrate the difference in runtimes, here is a quick benchmark." ] }, { "cell_type": "markdown", "id": "36b1c7a3-ee18-4248-9e9a-a563140035ee", "metadata": {}, "source": [ "Another important facette of codomain control is setting the number of returned results. By default, only *one* optimal solution or alignment gets computed.\n", "\n", "However, using codomain specificationas, you can instruct `Solver` to return all optimal scores, alignments or solutions. To enable this, set the codomain to an `Iterator` of those types. With the returned `Iterator` it is then possible to iterate over *all* optimal solutions of the problem." ] }, { "cell_type": "code", "execution_count": 12, "id": "1421d1e7-0149-446a-bfe4-27c159028ea8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[2.0, 2.0]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from typing import Iterator\n", "\n", "solver_4 = solver_1.to_codomain(Iterator[pyalign.solve.Alignment])\n", "[x.score for x in solver_4.solve(problem)]" ] }, { "cell_type": "markdown", "id": "837a491d-1c87-43d0-8d05-bb79f0c3343d", "metadata": {}, "source": [ "It is also possible to obtain `List`s of items. `List`s might be easier to work with than `Iterator`s. `Iterator`s on the other hand create the results lazily and on demand. This means `Iterator`s consume only $O(1)$ memory, whereas `List`s might contain $O(n^2)$ elements for some problems." ] }, { "cell_type": "code", "execution_count": 13, "id": "1fa3efd4-bc44-4075-b5ea-72ce63605ae5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from typing import List\n", "\n", "solver_4 = solver_1.to_codomain(List[pyalign.solve.Alignment])\n", "solver_4.solve(problem)" ] }, { "cell_type": "markdown", "id": "bd385aee-8672-4860-946d-c789a3685d23", "metadata": {}, "source": [ "### Batches\n", "> \"compute x for the price of one\"" ] }, { "cell_type": "markdown", "id": "5c4fa82d-ad11-4018-a311-9187d72318e7", "metadata": {}, "source": [ "Computation of alignments can be sped up with data level parallelism in modern CPUs (i.e. SIMD instructions). In order to make use of these, problems need to be given to `pyalign` in lists, i.e. not one at a time.\n", "\n", "\n", "
\n", "Batches incur some overhead and the speed-up is more noticeable with longer sequences and batches that contain sequences of similar length.\n", "
" ] }, { "cell_type": "code", "execution_count": 62, "id": "c9a7d16c-8163-4c78-81ed-a73ff6655291", "metadata": {}, "outputs": [], "source": [ "import pyalign\n", "import random\n", "\n", "def generate_problems(n, a=5, b=15):\n", " '''\n", " generate n problems, each with random sequences\n", " of length between a and b\n", " '''\n", " \n", " alphabet = \"ABCD\"\n", " \n", " def gen_seq(k):\n", " return \"\".join(random.choices(alphabet, k=k))\n", "\n", " problem_set = pyalign.problems.alphabetic(\n", " alphabet,\n", " lambda x, y: 1 if x == y else -1,\n", " direction=\"maximize\")\n", "\n", " problems = []\n", " for _ in range(n):\n", " problems.append(problem_set.new_problem(\n", " gen_seq(random.randint(a, b)),\n", " gen_seq(random.randint(a, b))))\n", " \n", " return problems" ] }, { "cell_type": "markdown", "id": "0f245514-03da-4143-be80-5278dbebae27", "metadata": {}, "source": [ "We generate a list of problems and a solver that returns scores:" ] }, { "cell_type": "code", "execution_count": null, "id": "7d8d2e69-b172-4b1f-bc6f-e44217a23c33", "metadata": {}, "outputs": [], "source": [ "some_problems = generate_problems(10)\n", "score_solver = pyalign.solve.LocalSolver(codomain=pyalign.solve.Score)" ] }, { "cell_type": "markdown", "id": "e698e4f0-35e3-4202-bd96-45c9407450b7", "metadata": {}, "source": [ "Solving the problems through single calls to `solve` vs. using a single batched call returns the same results:" ] }, { "cell_type": "code", "execution_count": 63, "id": "2a9b97ae-f94b-4354-a479-5f6af1ce9913", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[4.0, 6.0, 2.0, 4.0, 3.0, 3.0, 8.0, 2.0, 4.0, 6.0]" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[score_solver.solve(p) for p in some_problems]" ] }, { "cell_type": "code", "execution_count": 64, "id": "4f55f699-d8d4-4a06-8017-66af752941c9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[4.0, 6.0, 2.0, 4.0, 3.0, 3.0, 8.0, 2.0, 4.0, 6.0]" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "score_solver.solve(some_problems)" ] }, { "cell_type": "markdown", "id": "352f25dd-1372-43ce-bf9b-2db645643700", "metadata": {}, "source": [ "As demonstrated below, the runtimes are different though:" ] }, { "cell_type": "code", "execution_count": 65, "id": "50120060-af5a-4690-a224-72fdfffd5876", "metadata": {}, "outputs": [], "source": [ "import time\n", "\n", "def benchmark(f, title, n=1000):\n", " t0 = time.perf_counter_ns()\n", " for _ in range(n):\n", " r = f()\n", " t1 = time.perf_counter_ns()\n", " print(f\"{title}: time per solve: {(t1 - t0) / (n * 1000):.1f} μs\")" ] }, { "cell_type": "code", "execution_count": 66, "id": "4cd7ec65-dd52-4fbd-be6a-0d97665cca2d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "no batch: time per solve: 1092.9 μs\n" ] } ], "source": [ "benchmark(lambda: [score_solver.solve(p) for p in some_problems], \"no batch\")" ] }, { "cell_type": "code", "execution_count": 67, "id": "46372872-8395-4fc9-8f45-e9151475bb17", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "as batch: time per solve: 366.9 μs\n" ] } ], "source": [ "benchmark(lambda: score_solver.solve(some_problems), \"as batch\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }