{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# MinHashing all the things: a quick analysis of MAG search results\n", "\n", "Companion notebook for https://blog.luizirber.org/2020/07/23/mag-results/\n", "\n", "## Preparing some metadata" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mBuilding DAG of jobs...\u001b[0m\n", "\u001b[33mNothing to be done.\u001b[0m\n", "\u001b[33mComplete log: /home/luizirber/work/sourmash-bio/2020-07-22-mag-search/.snakemake/log/2020-07-24T144113.032349.snakemake.log\u001b[0m\n" ] } ], "source": [ "!snakemake -j1" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from ipywidgets import interactive\n", "import ipywidgets as widgets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading the data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " MAG metagenome containment\n", "89721 TOBG_NP-28 SRR2103020 0.500144\n", "53860 TOBG_NP-42 SRR7168048 0.500169\n", "28907 TOBG_EAC-55 SRR7479580 0.500182\n", "73862 TOBG_SP-43 SRR7986296 0.500218\n", "27250 TOBG_SAT-1356 SRR6713912 0.500264\n", "... ... ... 
# %%
# Load the MAG-vs-metagenome search results, drop the metagenomes the MAGs
# were derived from, and load the per-MAG metadata tables.
# ("# %%" lines mark notebook cell boundaries.)
import re

from IPython.display import HTML
import pandas as pd
import numpy as np

# `names=` supplies the column labels, so the file is read as headerless.
data = pd.read_csv(
    "results.csv",
    sep=",",
    quotechar="'",
    names=["MAG", "metagenome", "containment"],
)

# These MAGs come from TARA Oceans, and some of them are in Parks 8k, so
# those runs are removed from the results below. Only the "Run" column of
# each run-info table is needed.
tara_metag = pd.read_csv("inputs/tara_runinfo.csv", header=0, usecols=["Run"])
parks_8k = pd.read_csv("inputs/parks_runinfo.csv", header=0, usecols=["Run"])

# Fix names so it's easier to query.
# The named group `id` is what each replacement callable reads back through
# m.group("id"). `regex=True` is passed explicitly because a callable
# replacement is only accepted for regex patterns.
# MAG: strip the single quotes wrapped around the identifier.
data["MAG"] = data["MAG"].str.replace(
    r"'(?P<id>.*)'", lambda m: m.group("id"), regex=True
)
# metagenome: keep what sits between the last "/" and ".sig"; the dot is
# escaped so it matches a literal "." only.
data["metagenome"] = data["metagenome"].str.replace(
    r".*/(?P<id>.*)\.sig.*", lambda m: m.group("id"), regex=True
)

# Actually remove TARA and Parks: keep only runs listed in neither table.
to_keep = set(data["metagenome"].values).difference(set(tara_metag["Run"].values))
to_keep = to_keep.difference(set(parks_8k["Run"].values))
filtered = data[data["metagenome"].isin(to_keep)]
print(filtered[filtered["containment"] > 0.5].sort_values(by="containment"))

# MAGs metadata, indexed by genome id so a MAG name can be looked up with .loc.
# header=1 takes the column labels from the second row of each sheet.
taxonomy = pd.read_excel("inputs/tully_mag_taxonomy.xlsx", header=1).set_index("Genome ID")
stats = pd.read_excel("inputs/tully_mag_stats.xlsx", header=1).set_index("Genome ID")

print("unique SRA runs: ", len(filtered["metagenome"].unique()))

# %%
# Distinct MAGs left after removing the TARA / Parks runs (any containment).
len(filtered["MAG"].unique())
"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1407" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(filtered[filtered['containment'] > 0.5][\"MAG\"].unique())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2938" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(filtered[filtered['containment'] > 0.5][\"metagenome\"].unique())" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "filtered = filtered[filtered['containment'] > 0.5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Picking a candidate: TOBG_NP-110, I choose you!" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e47472e827a64ee48196c6ab760ba550", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(Dropdown(description='MAG name', index=889, options=('TOBG_SP-231', 'TOBG_NAT-188', 'TOB…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def update_candidate(candidate): \n", " print(taxonomy.loc[candidate])\n", " print()\n", " print(stats.loc[candidate])\n", " print()\n", "\n", " filters = (\n", " (filtered[\"MAG\"] == candidate) &\n", " (filtered['containment'] > 0.5)\n", " )\n", " \n", " with_link = filtered.copy()\n", " with_link[\"metagenome\"] = filtered[\"metagenome\"].apply(\n", " lambda x: \"{}\".format(x, x)\n", " )\n", "\n", " display(HTML(with_link[filters]\n", " .sort_values(by=\"containment\", ascending=False)\n", " .to_html(render_links=True, escape=False,)))\n", " \n", "\n", "candidatepicker = interactive(update_candidate, candidate=widgets.Dropdown(\n", " options=filtered['MAG'].unique(),\n", " value='TOBG_NP-110',\n", " description='MAG name',\n", " disabled=False\n", "))\n", 
"display(candidatepicker)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }