{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "res = pd.read_csv(\"species_cranky_wozniak\", sep = \"\\t\", header = None, \n", " names = ['taxid', 'rank', 'taxpath', 'taxpathsn', 'percentage'],\n", " skiprows= 1)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
taxidranktaxpathtaxpathsnpercentage
056636species2157|28889|183924|114380|2272|56635|56636Archaea|Crenarchaeota|Thermoprotei|Desulfuroco...0.0
1477693species2157|28889|183924|114380|2272|2273|477693Archaea|Crenarchaeota|Thermoprotei|Desulfuroco...0.0
2160233species2157|28889|183924|114380|2272|54258|160233Archaea|Crenarchaeota|Thermoprotei|Desulfuroco...0.0
354248species2157|28889|183924|114380|2307|54247|54248Archaea|Crenarchaeota|Thermoprotei|Desulfuroco...0.0
42285species2157|28889|183924|2281|118883|2284|2285Archaea|Crenarchaeota|Thermoprotei|Sulfolobale...0.0
\n", "
" ], "text/plain": [ " taxid rank taxpath \\\n", "0 56636 species 2157|28889|183924|114380|2272|56635|56636 \n", "1 477693 species 2157|28889|183924|114380|2272|2273|477693 \n", "2 160233 species 2157|28889|183924|114380|2272|54258|160233 \n", "3 54248 species 2157|28889|183924|114380|2307|54247|54248 \n", "4 2285 species 2157|28889|183924|2281|118883|2284|2285 \n", "\n", " taxpathsn percentage \n", "0 Archaea|Crenarchaeota|Thermoprotei|Desulfuroco... 0.0 \n", "1 Archaea|Crenarchaeota|Thermoprotei|Desulfuroco... 0.0 \n", "2 Archaea|Crenarchaeota|Thermoprotei|Desulfuroco... 0.0 \n", "3 Archaea|Crenarchaeota|Thermoprotei|Desulfuroco... 0.0 \n", "4 Archaea|Crenarchaeota|Thermoprotei|Sulfolobale... 0.0 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1029, 5)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.shape" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "\n", "def summarize_all_levels(df, ranks):\n", " new_rows = []\n", "\n", " for (_, _, taxpath, taxpathsn, percentage) in df.itertuples(index=False, name=None):\n", " lineage_values = taxpath.split(\"|\")\n", " lineage_names = taxpathsn.split(\"|\")\n", "\n", " for i, (rank, tax_id) in enumerate(zip(ranks[:-1], lineage_values), 1):\n", " taxpath = \"|\".join(lineage_values[:i])\n", " taxpathsn = \"|\".join(lineage_names[:i])\n", " new_rows.append([tax_id, rank, taxpath, taxpathsn, percentage])\n", "\n", " new_df = pd.DataFrame(new_rows, columns=df.columns)\n", " return new_df.groupby(\n", " ['taxid', 'rank', 'taxpath', 'taxpathsn'], as_index=False\n", " ).sum()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "tax_ranks = ranks = \"superkingdom|phylum|class|order|family|genus|species\".split(\"|\")\n", "new_df = summarize_all_levels(res, tax_ranks)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
taxidranktaxpathtaxpathsnpercentage
010genus2|1224|1236|72274|135621|10Bacteria|Proteobacteria|Gammaproteobacteria|Ps...0.010060
1100715genus2|1090|191410|191411|191412|100715Bacteria|Chlorobi|Chlorobia|Chlorobiales|Chlor...0.000000
2100883genus2|1239|526524|526525|128827|100883Bacteria|Firmicutes|Erysipelotrichia|Erysipelo...0.000000
31016genus2|976|117743|200644|49546|1016Bacteria|Bacteroidetes|Flavobacteriia|Flavobac...0.000000
41021genus2|1224|1236|72273|135617|1021Bacteria|Proteobacteria|Gammaproteobacteria|Th...0.000000
..................
103397050genus2|1224|28211|204455|31989|97050Bacteria|Proteobacteria|Alphaproteobacteria|Rh...0.191146
1034972family2|1239|186801|53433|972Bacteria|Firmicutes|Clostridia|Halanaerobiales...0.010060
1035976phylum2|976Bacteria|Bacteroidetes2.263578
1036978genus2|976|768503|768507|89373|978Bacteria|Bacteroidetes|Cytophagia|Cytophagales...0.010060
1037995019family2|1224|28216|80840|995019Bacteria|Proteobacteria|Betaproteobacteria|Bur...0.010060
\n", "

1038 rows × 5 columns

\n", "
" ], "text/plain": [ " taxid rank taxpath \\\n", "0 10 genus 2|1224|1236|72274|135621|10 \n", "1 100715 genus 2|1090|191410|191411|191412|100715 \n", "2 100883 genus 2|1239|526524|526525|128827|100883 \n", "3 1016 genus 2|976|117743|200644|49546|1016 \n", "4 1021 genus 2|1224|1236|72273|135617|1021 \n", "... ... ... ... \n", "1033 97050 genus 2|1224|28211|204455|31989|97050 \n", "1034 972 family 2|1239|186801|53433|972 \n", "1035 976 phylum 2|976 \n", "1036 978 genus 2|976|768503|768507|89373|978 \n", "1037 995019 family 2|1224|28216|80840|995019 \n", "\n", " taxpathsn percentage \n", "0 Bacteria|Proteobacteria|Gammaproteobacteria|Ps... 0.010060 \n", "1 Bacteria|Chlorobi|Chlorobia|Chlorobiales|Chlor... 0.000000 \n", "2 Bacteria|Firmicutes|Erysipelotrichia|Erysipelo... 0.000000 \n", "3 Bacteria|Bacteroidetes|Flavobacteriia|Flavobac... 0.000000 \n", "4 Bacteria|Proteobacteria|Gammaproteobacteria|Th... 0.000000 \n", "... ... ... \n", "1033 Bacteria|Proteobacteria|Alphaproteobacteria|Rh... 0.191146 \n", "1034 Bacteria|Firmicutes|Clostridia|Halanaerobiales... 0.010060 \n", "1035 Bacteria|Bacteroidetes 2.263578 \n", "1036 Bacteria|Bacteroidetes|Cytophagia|Cytophagales... 0.010060 \n", "1037 Bacteria|Proteobacteria|Betaproteobacteria|Bur... 0.010060 \n", "\n", "[1038 rows x 5 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "94.30576299999994" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum(res['percentage'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }