{ "cells": [ { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "import argparse\n", "import os\n", "import json\n", "import logging\n", "import string\n", "import collections\n", "\n", "import pandas as pd\n", "import numpy as np\n", "from sqlalchemy import create_engine\n", "import networkx as nx\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "if os.getcwd().endswith('notebook'):\n", " os.chdir('..')\n", "\n", "from rna_learn.alphabet import CODON_REDUNDANCY\n", "from rna_learn.codon_bias.graph import load_codon_bias" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "sns.set(palette='colorblind', font_scale=1.3)\n", "palette = sns.color_palette()\n", "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s (%(levelname)s) %(message)s\")\n", "logger = logging.getLogger(__name__)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "db_path = os.path.join(os.getcwd(), 'data/db/seq.db')\n", "engine = create_engine(f'sqlite+pysqlite:///{db_path}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load distance matrix" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2680, 2680)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distance_matrix_path = os.path.join(os.getcwd(), 'data/distance_matrix.npy')\n", "distance_matrix = np.load(distance_matrix_path, allow_pickle=True)\n", "distance_matrix.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load codon bias" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
species_taxidin_test_setAAA_ratioAAG_ratioAAT_ratioAAC_ratioACT_ratioACC_ratioACA_ratioACG_ratio...motilityrange_salinitycell_shapeisolation_sourcedoubling_hgenome_sizegc_contentcoding_genestRNA_genesrRNA16S_genes
07True0.0825500.9174500.3277240.6722760.0245110.5753530.0362620.363875...yesNoneNonehost_plantNaN5369771.50067.3004713.66753.0003.0
19False0.9196270.0803730.8581680.1418320.4545690.0500260.4460180.049387...yesNonecoccobacillushost_animal_ectotherm35.40601699.24325.469517.54930.4851.0
211True0.0073020.9926980.0107300.9892700.0085860.4255270.0157890.550098...yesNonebacillushost_animal_endothermNaN3526440.80073.8053139.33345.0002.0
314True0.6648660.3351340.7755710.2244290.4582030.1880520.3041830.049563...noNonebacilluswater_hotspring2.471959987.60033.7001876.33346.0002.0
419True0.5518100.4481900.4405590.5594410.0991340.5914780.0977960.211592...yesNonebacilluspetroleumNaN3722544.66755.1003222.66754.0002.0
\n", "

5 rows × 86 columns

\n", "
" ], "text/plain": [ " species_taxid in_test_set AAA_ratio AAG_ratio AAT_ratio AAC_ratio \\\n", "0 7 True 0.082550 0.917450 0.327724 0.672276 \n", "1 9 False 0.919627 0.080373 0.858168 0.141832 \n", "2 11 True 0.007302 0.992698 0.010730 0.989270 \n", "3 14 True 0.664866 0.335134 0.775571 0.224429 \n", "4 19 True 0.551810 0.448190 0.440559 0.559441 \n", "\n", " ACT_ratio ACC_ratio ACA_ratio ACG_ratio ... motility range_salinity \\\n", "0 0.024511 0.575353 0.036262 0.363875 ... yes None \n", "1 0.454569 0.050026 0.446018 0.049387 ... yes None \n", "2 0.008586 0.425527 0.015789 0.550098 ... yes None \n", "3 0.458203 0.188052 0.304183 0.049563 ... no None \n", "4 0.099134 0.591478 0.097796 0.211592 ... yes None \n", "\n", " cell_shape isolation_source doubling_h genome_size gc_content \\\n", "0 None host_plant NaN 5369771.500 67.300 \n", "1 coccobacillus host_animal_ectotherm 35.40 601699.243 25.469 \n", "2 bacillus host_animal_endotherm NaN 3526440.800 73.805 \n", "3 bacillus water_hotspring 2.47 1959987.600 33.700 \n", "4 bacillus petroleum NaN 3722544.667 55.100 \n", "\n", " coding_genes tRNA_genes rRNA16S_genes \n", "0 4713.667 53.000 3.0 \n", "1 517.549 30.485 1.0 \n", "2 3139.333 45.000 2.0 \n", "3 1876.333 46.000 2.0 \n", "4 3222.667 54.000 2.0 \n", "\n", "[5 rows x 86 columns]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "codon_bias_path = os.path.join(os.getcwd(), 'data/species_codon_ratios.csv')\n", "species_codon_df = load_codon_bias(engine, codon_bias_path)\n", "species_codon_df.head()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "ratio_columns = [c for c in species_codon_df.columns if c.endswith('_ratio')]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Distance Threshold" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def compute_appropriate_threshold_distance(species_codon_df, distance_matrix, s_stds): \n", " output_columns = [\n", " 'species_taxid',\n", " 'distance_mean',\n", " 'distance_std',\n", " ]\n", " \n", " data = []\n", " for ix in range(len(species_codon_df)):\n", " species = species_codon_df.loc[ix]\n", "\n", " d = [\n", " v for i, v in enumerate(distance_matrix[ix])\n", " if i != ix\n", " ]\n", "\n", " data.append([\n", " species['species_taxid'],\n", " np.mean(d) if len(d) > 0 else 0.,\n", " np.std(d) if len(d) > 0 else 0.,\n", " ])\n", " \n", " distance_df = pd.DataFrame(data, columns=output_columns)\n", " \n", " return (distance_df['distance_mean'] - s_stds * distance_df['distance_std']).mean()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def compute_neighbour_stats(species_codon_df, distance_matrix, s_stds):\n", " threshold = compute_appropriate_threshold_distance(species_codon_df, distance_matrix, s_stds)\n", " \n", " species_taxids = species_codon_df['species_taxid'].values\n", " \n", " n_under_threshold = []\n", " for i, species_taxid in enumerate(species_taxids):\n", " distances = np.delete(distance_matrix[i], i)\n", " n_under_threshold.append(len([d for d in distances if d <= threshold]))\n", " \n", " n_zeros = len([v for v in n_under_threshold if v == 0])\n", " \n", " return threshold, {\n", " 'mean': np.round(np.mean(n_under_threshold), 2),\n", " 'std': np.round(np.std(n_under_threshold), 2),\n", " 'min': np.min(n_under_threshold),\n", " 'max': np.max(n_under_threshold),\n", " 'n_zeros': n_zeros,\n", " 'n_zeros_percent': np.round(100 * n_zeros / len(species_taxids), 2),\n", " }" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Threshold: 0.0982\n" ] }, { "data": { "text/plain": [ "{'mean': 186.73,\n", " 'std': 113.32,\n", " 'min': 0,\n", " 'max': 476,\n", " 'n_zeros': 6,\n", " 'n_zeros_percent': 0.22}" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "threshold, stats = compute_neighbour_stats(species_codon_df, distance_matrix, s_stds=1.3)\n", "print(f'Threshold: {threshold:.4f}')\n", "stats" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
species_taxidspeciesphylumsuperkingdom
6832337Thermotoga neapolitanaThermotogaeBacteria
119057487Pseudothermotoga hypogeaThermotogaeBacteria
120158290Archaeoglobus veneficusEuryarchaeotaArchaea
146393929Thermotoga petrophilaThermotogaeBacteria
146493930Thermotoga naphthophilaThermotogaeBacteria
2334565033Geoglobus acetivoransEuryarchaeotaArchaea
25291184387Mesotoga primaThermotogaeBacteria
25581236046Mesotoga inferaThermotogaeBacteria
\n", "
" ], "text/plain": [ " species_taxid species phylum superkingdom\n", "683 2337 Thermotoga neapolitana Thermotogae Bacteria\n", "1190 57487 Pseudothermotoga hypogea Thermotogae Bacteria\n", "1201 58290 Archaeoglobus veneficus Euryarchaeota Archaea\n", "1463 93929 Thermotoga petrophila Thermotogae Bacteria\n", "1464 93930 Thermotoga naphthophila Thermotogae Bacteria\n", "2334 565033 Geoglobus acetivorans Euryarchaeota Archaea\n", "2529 1184387 Mesotoga prima Thermotogae Bacteria\n", "2558 1236046 Mesotoga infera Thermotogae Bacteria" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ix = species_codon_df[species_codon_df['species_taxid'] == 2336].index[0]\n", "distances = distance_matrix[ix]\n", "neighbours_ix = [i for i, d in enumerate(distances) if d <= threshold and ix != i]\n", "species_codon_df.loc[neighbours_ix][['species_taxid', 'species', 'phylum', 'superkingdom']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load graph" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 287 ms, sys: 53.1 ms, total: 340 ms\n", "Wall time: 342 ms\n" ] } ], "source": [ "%%time\n", "graph_path = os.path.join(os.getcwd(), 'data/codon_bias_graph.gpickle')\n", "graph = nx.read_gpickle(graph_path)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 38.7 ms, sys: 1.2 ms, total: 39.9 ms\n", "Wall time: 39.1 ms\n" ] } ], "source": [ "%%time\n", "connected_components = list(nx.connected_components(graph))" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Component 1: 2,672 species\n", "Component 2: 2 species\n", "Component 3: 1 species\n", "Component 4: 1 species\n", "Component 5: 1 species\n", "Component 6: 1 species\n", "Component 7: 1 species\n", "Component 8: 1 species\n" ] } ], "source": [ "for i, component in enumerate(connected_components):\n", " print(f'Component {i+1}: {len(component):,} species')" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "from networkx.algorithms import community as nx_community" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 8.05 s, sys: 13.2 ms, total: 8.07 s\n", "Wall time: 8.08 s\n" ] } ], "source": [ "%%time\n", "communities = list(nx_community.asyn_lpa_communities(graph))" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Community 1: 1,091 species\n", "Community 2: 271 species\n", "Community 3: 1,251 species\n", " - Species 58290 is in community 3\n", " - Species 565033 is in community 3\n", "Community 4: 2 species\n", "Community 5: 2 species\n", "Community 6: 38 species\n", "Community 7: 1 species\n", "Community 8: 1 species\n", "Community 9: 1 species\n", "Community 10: 4 species\n", "Community 11: 1 species\n", "Community 12: 5 species\n", " - Species 2336 is in community 12\n", "Community 13: 1 species\n", "Community 14: 4 species\n", "Community 15: 2 species\n", "Community 16: 4 species\n", "Community 17: 1 species\n" ] } ], "source": [ "for i, community in enumerate(communities):\n", " print(f'Community {i+1}: {len(community):,} species')\n", " if 2336 in community:\n", " print(f' - Species 2336 is in community {i+1}')\n", " if 58290 in community:\n", " print(f' - Species 58290 is in community {i+1}')\n", " if 565033 in community:\n", " print(f' - Species 565033 is in community {i+1}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }