{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MinHashing all the things: a quick analysis of MAG search results\n",
    "\n",
    "Companion notebook for https://blog.luizirber.org/2020/07/23/mag-results/\n",
    "\n",
    "## Preparing some metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mBuilding DAG of jobs...\u001b[0m\n",
      "\u001b[33mNothing to be done.\u001b[0m\n",
      "\u001b[33mComplete log: /home/luizirber/work/sourmash-bio/2020-07-22-mag-search/.snakemake/log/2020-07-24T144113.032349.snakemake.log\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Shell escape: invoke snakemake with -j1.\n",
    "# NOTE(review): presumably this builds the input files read by later cells - confirm against the Snakefile.\n",
    "!snakemake -j1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ipywidgets import interactive\n",
    "import ipywidgets as widgets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                 MAG  metagenome  containment\n",
      "89721     TOBG_NP-28  SRR2103020     0.500144\n",
      "53860     TOBG_NP-42  SRR7168048     0.500169\n",
      "28907    TOBG_EAC-55  SRR7479580     0.500182\n",
      "73862     TOBG_SP-43  SRR7986296     0.500218\n",
      "27250  TOBG_SAT-1356  SRR6713912     0.500264\n",
      "...              ...         ...          ...\n",
      "60804    TOBG_RS-626  ERR4013358     0.988004\n",
      "66828    TOBG_SP-208  SRR5868539     0.989301\n",
      "66807   TOBG_SP-4095  SRR5868539     0.990476\n",
      "6268     TOBG_EAC-96  SRR8159436     0.992379\n",
      "66794    TOBG_NP-110  SRR5868539     0.993755\n",
      "\n",
      "[23644 rows x 3 columns]\n",
      "unique SRA runs:  6398\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "from IPython.display import HTML\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Search results; the file has no header row, so column names are given here\n",
    "data = pd.read_csv(\"results.csv\",\n",
    "                   sep=\",\",\n",
    "                   quotechar=\"'\",\n",
    "                   names=[\"MAG\", \"metagenome\", \"containment\"])\n",
    "\n",
    "# These MAGs come from TARA Oceans, and some of them are in Parks 8k,\n",
    "# so runs listed in either run-info table are removed from the results below\n",
    "tara_metag = pd.read_csv(\"inputs/tara_runinfo.csv\", header=0, usecols=[\"Run\"])\n",
    "parks_8k = pd.read_csv(\"inputs/parks_runinfo.csv\", header=0, usecols=[\"Run\"])\n",
    "\n",
    "# Fix names so it's easier to query.\n",
    "# regex=True is explicit because pandas >= 2.0 defaults to literal matching\n",
    "# and refuses a callable replacement unless the pattern is a regex.\n",
    "data['MAG'] = data['MAG'].str.replace(\n",
    "    r\"'(?P<id>.*)'\", lambda m: m.group(\"id\"), regex=True)\n",
    "# Strip the leading directories and the \".sig\" suffix (dot escaped so it matches literally)\n",
    "data['metagenome'] = data['metagenome'].str.replace(\n",
    "    r\".*/(?P<id>.*)\\.sig.*\", lambda m: m.group(\"id\"), regex=True)\n",
    "\n",
    "# Actually remove TARA and Parks\n",
    "excluded_runs = set(tara_metag[\"Run\"].values) | set(parks_8k[\"Run\"].values)\n",
    "to_keep = set(data['metagenome'].values) - excluded_runs\n",
    "filtered = data[data['metagenome'].isin(to_keep)]\n",
    "print(filtered[filtered['containment'] > 0.5].sort_values(by=\"containment\"))\n",
    "\n",
    "# MAGs metadata, indexed by genome ID so a MAG name can be looked up with .loc\n",
    "taxonomy = pd.read_excel(\"inputs/tully_mag_taxonomy.xlsx\", header=1).set_index(\"Genome ID\")\n",
    "stats = pd.read_excel(\"inputs/tully_mag_stats.xlsx\", header=1).set_index(\"Genome ID\")\n",
    "\n",
    "print(\"unique SRA runs: \", len(filtered['metagenome'].unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2291"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct MAGs in the filtered results\n",
    "filtered[\"MAG\"].unique().size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1407"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct MAGs with at least one match above 0.5 containment\n",
    "filtered.loc[filtered['containment'] > 0.5, \"MAG\"].unique().size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2938"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct metagenomes with at least one match above 0.5 containment\n",
    "filtered.loc[filtered['containment'] > 0.5, \"metagenome\"].unique().size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From here on, keep only the matches with containment above 0.5\n",
    "filtered = filtered.loc[filtered['containment'].gt(0.5)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Picking a candidate: TOBG_NP-110, I choose you!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e47472e827a64ee48196c6ab760ba550",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "interactive(children=(Dropdown(description='MAG name', index=889, options=('TOBG_SP-231', 'TOBG_NAT-188', 'TOB…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def update_candidate(candidate):\n",
    "    \"\"\"Show metadata and matching metagenomes for one MAG.\n",
    "\n",
    "    Prints the `taxonomy` and `stats` rows indexed by `candidate`, then\n",
    "    displays an HTML table of the `filtered` rows for that MAG whose\n",
    "    containment is above 0.5, sorted by decreasing containment. Each\n",
    "    metagenome cell is rendered as a link to its NCBI SRA run page.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    candidate\n",
    "        MAG identifier: a value of the \"MAG\" column of `filtered` that is\n",
    "        also a label in the index of `taxonomy` and `stats`.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If `candidate` is not in the index of `taxonomy` or `stats`.\n",
    "    \"\"\"\n",
    "    print(taxonomy.loc[candidate])\n",
    "    print()\n",
    "    print(stats.loc[candidate])\n",
    "    print()\n",
    "\n",
    "    filters = (\n",
    "      (filtered[\"MAG\"] == candidate) &\n",
    "      (filtered['containment'] > 0.5)\n",
    "    )\n",
    "\n",
    "    # Select the candidate's rows first, so links are only built for the rows\n",
    "    # that are displayed instead of for the whole results table on every change.\n",
    "    with_link = filtered[filters].copy()\n",
    "    with_link[\"metagenome\"] = with_link[\"metagenome\"].apply(\n",
    "        lambda x: \"<a href='https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}'>{}</a>\".format(x, x)\n",
    "    )\n",
    "\n",
    "    # escape=False keeps the <a> tags built above as real HTML in the table\n",
    "    display(HTML(with_link\n",
    "         .sort_values(by=\"containment\", ascending=False)\n",
    "         .to_html(render_links=True, escape=False,)))\n",
    "    \n",
    "\n",
    "# Dropdown listing every MAG present in the filtered results,\n",
    "# preselected on the candidate discussed in this section\n",
    "mag_dropdown = widgets.Dropdown(\n",
    "    options=filtered['MAG'].unique(),\n",
    "    value='TOBG_NP-110',\n",
    "    description='MAG name',\n",
    "    disabled=False,\n",
    ")\n",
    "\n",
    "# Re-run update_candidate whenever the dropdown selection changes\n",
    "candidatepicker = interactive(update_candidate, candidate=mag_dropdown)\n",
    "display(candidatepicker)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}