{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from biom.table import Table\n", "from biom.util import biom_open\n", "import pandas as pd\n", "import numpy as np\n", "from skbio import TreeNode \n", "import math\n", "from qiime2 import Artifact\n", "from qiime2.plugins import empress\n", "from qiime2 import Visualization\n", "import scipy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def assign_counts(tree,old_table, young_table, tip_names):\n", " \"\"\"\n", " Counts how many old/young samples for each node in the tree. \n", " \"\"\"\n", " for node in tree.postorder(include_self=False):\n", " node.old_count = 0\n", " node.young_count = 0\n", " if node.is_tip():\n", " node.old_count = old_table[node.name].sum()\n", " node.young_count = young_table[node.name].sum()\n", " else:\n", " tips = [tip.name for tip in node.tips()]\n", " node.old_count = old_table[tips].max(axis=1).sum()\n", " node.young_count = young_table[tips].max(axis=1).sum()\n", " \n", "def calc_old_young_log(tree, num_old_samples, num_young_samples):\n", " \"\"\"\n", " Find log ratio of old / young samples for each node in the tree\n", " \"\"\"\n", " young_min = 0\n", " for node in tree.postorder(include_self=False):\n", " if node.old_count == 0 and node.young_count == 0:\n", " # node was not found in a single old or young sample\n", " node.old_young_log = 0\n", " elif node.old_count == 0:\n", " # node was only found in young samples\n", " node.old_young_log = -np.inf\n", " elif node.young_count == 0:\n", " # node was only found in old samples\n", " node.old_young_log = np.inf\n", " else:\n", " # calculate the log ratio of old / young samples\n", " node.old_young_log = math.log(node.old_count / node.young_count,2)\n", " # normiliztion term\n", " node.old_young_log -= math.log(num_old_samples/ num_young_samples,2)\n", " if node.is_tip():\n", " young_min = min(young_min, node.old_young_log)\n", " return young_min\n", "\n", "def assign_old_young_status(tree, young_min):\n", " \"\"\"\n", " Assign old or young status for each node based on its log ratio.\n", " \"\"\"\n", " for node in tree.postorder(include_self=False):\n", " if node.old_young_log > 0:\n", " node.age = \"old\"\n", " elif node.old_young_log < 0:\n", " node.age = \"young\"\n", " else:\n", " node.age = \"no difference\"\n", " if node.old_young_log > young_min:\n", " node.old_young_log = young_min\n", " if node.old_young_log < -1 * young_min:\n", " node.old_young_log = -1 * young_min\n", " \n", "def assign_name(tree):\n", " \"\"\"\n", " Assign unique ids for each node in the tree.\n", " \"\"\"\n", " i = 0\n", " for node in tree.postorder(include_self=True):\n", " if not type(node.name) == type(\"str\"):\n", " node.name = \"blank_\" + str(i)\n", " i += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "table_MG_path = 'data/finrisk/anonymized-finrisk-MG-BL_AGE-filtered_rarefied_table.biom'\n", "finrisk_metadata_path = 'data/finrisk/anonymized.age-only.finrisk.metadata.txt'\n", "wol_tree_path = 'data/wol/wol-tree.nwk'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load BIOM table and sample metadata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with biom_open(\"finrisk-MG-BL_AGE-filtered_rarefied_table.biom\") as f:\n", "with biom_open(table_MG_path) as f:\n", " table = Table.from_hdf5(f)\n", "table.pa()\n", "pd_table = (table.to_dataframe(dense=True))\n", "tip_names = set(pd_table.index.to_list())\n", "\n", "# metadata = pd.read_csv(\"gotu.shared.metadata.txt\", sep=\"\\t\", index_col=0)\n", "metadata = pd.read_csv(finrisk_metadata_path, sep=\"\\t\", index_col=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Filter/split sample metadata into group >= 60 and <= 35" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "old = metadata.loc[metadata[\"BL_AGE\"] >= 60]\n", "young = metadata.loc[metadata[\"BL_AGE\"] <= 35]\n", "\n", "old_samples = [s for s in old.index.tolist() if table.exists(s)]\n", "young_sample = [s for s in young.index.tolist() if table.exists(s)]\n", "\n", "old_table = (pd_table[old_samples]).T\n", "young_table = (pd_table[young_sample]).T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load tree and assign log ratios" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tree = TreeNode.read(wol_tree_path,format=\"newick\")\n", "assign_name(tree)\n", "tree = tree.shear(tip_names)\n", "assign_counts(tree, old_table, young_table, tip_names)\n", "young_min = calc_old_young_log(tree, len(old_samples), len(young_sample))\n", "young_min = abs(young_min)\n", "assign_old_young_status(tree, young_min)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Format taxonomy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tax = pd.read_csv(\"lineages.txt\", sep=\"\\t\", index_col=0)\n", "tax = tax.loc[tip_names]\n", "tax[[\"Level 1\", \"Level 2\",\"Level 3\", \"Level 4\", \"Level 5\", \"Level 6\", \"Level 7\"]] = \\\n", " tax.Taxonmy.str.split(\";\", expand=True)\n", "tax.loc[[\"G001765415\",\n", "\"G001899365\",\n", "\"G001899425\",\n", "\"G001917115\",\n", "\"G001917235\",\n", "\"G001917285\",\n", "\"G000980455\",\n", "\"G000431115\",\n", "\"G000431315\",\n", "\"G000431555\",\n", "\"G000433095\",\n", "\"G000433235\",\n", "\"G000437435\",\n", "\"G000437655\",\n", "\"G000980375\"], \"Level 2\"] = \"p__Melainabacteria\"\n", "tax.loc[[\n", "\"G000432575\",\n", "\"G000433255\",\n", "\"G000433455\",\n", "\"G000433475\",\n", "\"G000433875\",\n", "\"G000434835\",\n", "\"G000436255\",\n", "\"G000437335\",\n", "\"G000438295\",\n", "\"G000438415\"\n", "], \"Level 2\"] = \"p__Firmicutes\"\n", "tax = tax.applymap(lambda x: x.split(\"__\")[1].split(\"_\")[0])\n", "tax = tax.to_dict()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create feature metadata and sample metadata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(\"old_young_log.tsv\", 'w') as f:\n", " f.write(\"feature id\\told_young_log_ratio\\tage\\tLevel 1\\tLevel 2\\tLevel 3\\tLevel 4\\tLevel 5\\tLevel 6\\tLevel 7\\n\")\n", " for node in tree.postorder(include_self=False):\n", " if node.is_tip():\n", " f.write(\"\" + node.name + \\\n", " \"\\t\" + str(node.old_young_log) + \\\n", " \"\\t\" + node.age +\n", " \"\\t\" + tax[\"Level 1\"][node.name] +\n", " \"\\t\" + tax[\"Level 2\"][node.name] +\n", " \"\\t\" + tax[\"Level 3\"][node.name] +\n", " \"\\t\" + tax[\"Level 4\"][node.name] +\n", " \"\\t\" + tax[\"Level 5\"][node.name] +\n", " \"\\t\" + tax[\"Level 6\"][node.name] +\n", " \"\\t\" + tax[\"Level 7\"][node.name] +\n", " \"\\n\")\n", " else:\n", " if type(node.name) == type(\"str\"):\n", " f.write(\"\" + node.name + \\\n", " \"\\t\" + str(node.old_young_log) + \\\n", " \"\\t\" + node.age + \\\n", " \"\\t\" + \"\" + \\\n", " \"\\t\" + \"\" +\n", " \"\\t\" + \"\" +\n", " \"\\t\" + \"\" +\n", " \"\\t\" + \"\" +\n", " \"\\t\" + \"\" +\n", " \"\\t\" + \"\" +\n", " \"\\n\")\n", " \n", "metadata.loc[young_sample, \"age_status\"] = \"young\"\n", "metadata.loc[old_samples, \"age_status\"] = \"old\"\n", "metadata = metadata.loc[old_samples + young_sample]\n", "metadata.to_csv(\"s-meta.tsv\", sep=\"\\t\", na_rep=\"NA\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create qiime2 Artifacts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "table = table.filter(old_samples + young_sample, axis=\"sample\", inplace=False)\n", "Artifact.import_data(\"FeatureTable[Frequency]\", table).save(\"table.qza\")\n", "Artifact.import_data(\"Phylogeny[Rooted]\", tree).save(\"dec_shear_tree.qza\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create EMPress plot" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!qiime empress community-plot \\\n", " --i-tree dec_shear_tree.qza \\\n", " --i-feature-table table.qza \\\n", " --m-sample-metadata-file s-meta.tsv \\\n", " --m-feature-metadata-file old_young_log.tsv \\\n", " --o-visualization fig2c.qzv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load EMPress visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Visualization.load(\"fig2c.qzv\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 4 }