{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from biom.table import Table\n",
    "from biom.util import biom_open\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from skbio import TreeNode \n",
    "import math\n",
    "from qiime2 import Artifact\n",
    "from qiime2.plugins import empress\n",
    "from qiime2 import Visualization\n",
    "import scipy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def assign_counts(tree,old_table, young_table, tip_names):\n",
    "    \"\"\"\n",
    "    Counts how many old/young samples for each node in the tree. \n",
    "    \"\"\"\n",
    "    for node in tree.postorder(include_self=False):\n",
    "        node.old_count = 0\n",
    "        node.young_count = 0\n",
    "        if node.is_tip():\n",
    "            node.old_count = old_table[node.name].sum()\n",
    "            node.young_count = young_table[node.name].sum()\n",
    "        else:\n",
    "            tips = [tip.name for tip in node.tips()]\n",
    "            node.old_count = old_table[tips].max(axis=1).sum()\n",
    "            node.young_count = young_table[tips].max(axis=1).sum()\n",
    "            \n",
    "def calc_old_young_log(tree, num_old_samples, num_young_samples):\n",
    "    \"\"\"\n",
    "    Find log ratio of old / young samples for each node in the tree\n",
    "    \"\"\"\n",
    "    young_min = 0\n",
    "    for node in tree.postorder(include_self=False):\n",
    "        if node.old_count == 0 and node.young_count == 0:\n",
    "            # node was not found in a single old or young sample\n",
    "            node.old_young_log = 0\n",
    "        elif node.old_count == 0:\n",
    "            # node was only found in young samples\n",
    "            node.old_young_log = -np.inf\n",
    "        elif node.young_count == 0:\n",
    "            # node was only found in old samples\n",
    "            node.old_young_log = np.inf\n",
    "        else:\n",
    "            # calculate the log ratio of old / young samples\n",
    "            node.old_young_log = math.log(node.old_count / node.young_count,2)\n",
    "            # normiliztion term\n",
    "            node.old_young_log -= math.log(num_old_samples/ num_young_samples,2)\n",
    "            if node.is_tip():\n",
    "                young_min = min(young_min, node.old_young_log)\n",
    "    return young_min\n",
    "\n",
    "def assign_old_young_status(tree, young_min):\n",
    "    \"\"\"\n",
    "    Assign old or young status for each node based on its log ratio.\n",
    "    \"\"\"\n",
    "    for node in tree.postorder(include_self=False):\n",
    "        if node.old_young_log > 0:\n",
    "            node.age = \"old\"\n",
    "        elif node.old_young_log < 0:\n",
    "            node.age = \"young\"\n",
    "        else:\n",
    "            node.age = \"no difference\"\n",
    "        if node.old_young_log > young_min:\n",
    "            node.old_young_log = young_min\n",
    "        if node.old_young_log < -1 * young_min:\n",
    "            node.old_young_log = -1 * young_min\n",
    "            \n",
    "def assign_name(tree):\n",
    "    \"\"\"\n",
    "    Assign unique ids for each node in the tree.\n",
    "    \"\"\"\n",
    "    i = 0\n",
    "    for node in tree.postorder(include_self=True):\n",
    "        if not type(node.name) == type(\"str\"):\n",
    "            node.name = \"blank_\" + str(i)\n",
    "            i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "table_MG_path = 'data/finrisk/anonymized-finrisk-MG-BL_AGE-filtered_rarefied_table.biom'\n",
    "finrisk_metadata_path = 'data/finrisk/anonymized.age-only.finrisk.metadata.txt'\n",
    "wol_tree_path = 'data/wol/wol-tree.nwk'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load BIOM table and sample metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with biom_open(\"finrisk-MG-BL_AGE-filtered_rarefied_table.biom\") as f:\n",
    "with biom_open(table_MG_path) as f:\n",
    "    table = Table.from_hdf5(f)\n",
    "table.pa()\n",
    "pd_table = (table.to_dataframe(dense=True))\n",
    "tip_names = set(pd_table.index.to_list())\n",
    "\n",
    "# metadata = pd.read_csv(\"gotu.shared.metadata.txt\", sep=\"\\t\", index_col=0)\n",
    "metadata = pd.read_csv(finrisk_metadata_path, sep=\"\\t\", index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Filter/split sample metadata into group >= 60 and <= 35"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "old = metadata.loc[metadata[\"BL_AGE\"] >= 60]\n",
    "young = metadata.loc[metadata[\"BL_AGE\"] <= 35]\n",
    "\n",
    "old_samples = [s for s in old.index.tolist() if table.exists(s)]\n",
    "young_sample = [s for s in young.index.tolist() if table.exists(s)]\n",
    "\n",
    "old_table = (pd_table[old_samples]).T\n",
    "young_table = (pd_table[young_sample]).T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load tree and assign log ratios"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tree = TreeNode.read(wol_tree_path,format=\"newick\")\n",
    "assign_name(tree)\n",
    "tree = tree.shear(tip_names)\n",
    "assign_counts(tree, old_table, young_table, tip_names)\n",
    "young_min = calc_old_young_log(tree, len(old_samples), len(young_sample))\n",
    "young_min = abs(young_min)\n",
    "assign_old_young_status(tree, young_min)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Format taxonomy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tax = pd.read_csv(\"lineages.txt\", sep=\"\\t\", index_col=0)\n",
    "tax = tax.loc[tip_names]\n",
    "tax[[\"Level 1\", \"Level 2\",\"Level 3\", \"Level 4\", \"Level 5\", \"Level 6\", \"Level 7\"]] = \\\n",
    "    tax.Taxonmy.str.split(\";\", expand=True)\n",
    "tax.loc[[\"G001765415\",\n",
    "\"G001899365\",\n",
    "\"G001899425\",\n",
    "\"G001917115\",\n",
    "\"G001917235\",\n",
    "\"G001917285\",\n",
    "\"G000980455\",\n",
    "\"G000431115\",\n",
    "\"G000431315\",\n",
    "\"G000431555\",\n",
    "\"G000433095\",\n",
    "\"G000433235\",\n",
    "\"G000437435\",\n",
    "\"G000437655\",\n",
    "\"G000980375\"], \"Level 2\"] = \"p__Melainabacteria\"\n",
    "tax.loc[[\n",
    "\"G000432575\",\n",
    "\"G000433255\",\n",
    "\"G000433455\",\n",
    "\"G000433475\",\n",
    "\"G000433875\",\n",
    "\"G000434835\",\n",
    "\"G000436255\",\n",
    "\"G000437335\",\n",
    "\"G000438295\",\n",
    "\"G000438415\"\n",
    "], \"Level 2\"] = \"p__Firmicutes\"\n",
    "tax = tax.applymap(lambda x: x.split(\"__\")[1].split(\"_\")[0])\n",
    "tax = tax.to_dict()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create feature metadata and sample metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"old_young_log.tsv\", 'w') as f:\n",
    "    f.write(\"feature id\\told_young_log_ratio\\tage\\tLevel 1\\tLevel 2\\tLevel 3\\tLevel 4\\tLevel 5\\tLevel 6\\tLevel 7\\n\")\n",
    "    for node in tree.postorder(include_self=False):\n",
    "        if node.is_tip():\n",
    "            f.write(\"\" + node.name + \\\n",
    "                    \"\\t\" + str(node.old_young_log) + \\\n",
    "                    \"\\t\" + node.age  +\n",
    "                    \"\\t\" + tax[\"Level 1\"][node.name] +\n",
    "                    \"\\t\" + tax[\"Level 2\"][node.name] +\n",
    "                    \"\\t\" + tax[\"Level 3\"][node.name] +\n",
    "                    \"\\t\" + tax[\"Level 4\"][node.name] +\n",
    "                    \"\\t\" + tax[\"Level 5\"][node.name] +\n",
    "                    \"\\t\" + tax[\"Level 6\"][node.name] +\n",
    "                    \"\\t\" + tax[\"Level 7\"][node.name] +\n",
    "                    \"\\n\")\n",
    "        else:\n",
    "            if type(node.name) == type(\"str\"):\n",
    "                f.write(\"\" + node.name + \\\n",
    "                    \"\\t\" + str(node.old_young_log) + \\\n",
    "                    \"\\t\" + node.age  + \\\n",
    "                    \"\\t\" + \"\" + \\\n",
    "                    \"\\t\" + \"\" +\n",
    "                    \"\\t\" + \"\" +\n",
    "                    \"\\t\" + \"\" +\n",
    "                    \"\\t\" + \"\" +\n",
    "                    \"\\t\" + \"\" +\n",
    "                    \"\\t\" + \"\" +\n",
    "                    \"\\n\")\n",
    "        \n",
    "metadata.loc[young_sample, \"age_status\"] = \"young\"\n",
    "metadata.loc[old_samples, \"age_status\"] = \"old\"\n",
    "metadata = metadata.loc[old_samples + young_sample]\n",
    "metadata.to_csv(\"s-meta.tsv\", sep=\"\\t\", na_rep=\"NA\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create qiime2 Artifacts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "table = table.filter(old_samples + young_sample, axis=\"sample\", inplace=False)\n",
    "Artifact.import_data(\"FeatureTable[Frequency]\", table).save(\"table.qza\")\n",
    "Artifact.import_data(\"Phylogeny[Rooted]\", tree).save(\"dec_shear_tree.qza\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create EMPress plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!qiime empress community-plot \\\n",
    "    --i-tree dec_shear_tree.qza \\\n",
    "    --i-feature-table table.qza \\\n",
    "    --m-sample-metadata-file s-meta.tsv \\\n",
    "    --m-feature-metadata-file old_young_log.tsv \\\n",
    "    --o-visualization fig2c.qzv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load EMPress visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Visualization.load(\"fig2c.qzv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}