{ "cells": [ { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "import argparse\n", "import os\n", "import json\n", "import logging\n", "import string\n", "import collections\n", "\n", "import pandas as pd\n", "import numpy as np\n", "from sqlalchemy import create_engine\n", "import networkx as nx\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# When the kernel starts in a directory whose path ends in 'notebook', move up one level:\n", "# the cwd-based data paths built in the cells below (and presumably the rna_learn imports)\n", "# are resolved from that parent directory.\n", "if os.getcwd().endswith('notebook'):\n", "    os.chdir('..')\n", "\n", "from rna_learn.alphabet import CODON_REDUNDANCY\n", "from rna_learn.codon_bias.graph import load_codon_bias" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Global plot styling and logging configuration.\n", "sns.set(palette='colorblind', font_scale=1.3)\n", "palette = sns.color_palette()\n", "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s (%(levelname)s) %(message)s\")\n", "logger = logging.getLogger(__name__)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# SQLAlchemy engine for the SQLite database located under the current working directory.\n", "db_path = os.path.join(os.getcwd(), 'data/db/seq.db')\n", "engine = create_engine(f'sqlite+pysqlite:///{db_path}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load distance matrix" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2680, 2680)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distance_matrix_path = os.path.join(os.getcwd(), 'data/distance_matrix.npy')\n", "# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can execute\n", "# code from a tampered file. Only load trusted files here; if the matrix is a plain numeric\n", "# array, re-save it without pickling and drop the flag.\n", "distance_matrix = np.load(distance_matrix_path, allow_pickle=True)\n", "# A pairwise distance matrix must be 2-D and square; fail early on a wrong or corrupted file.\n", "assert distance_matrix.ndim == 2 and distance_matrix.shape[0] == distance_matrix.shape[1], (\n", "    f'expected a square distance matrix, got shape {distance_matrix.shape}'\n", ")\n", "distance_matrix.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load codon bias" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | species_taxid | \n", "in_test_set | \n", "AAA_ratio | \n", "AAG_ratio | \n", "AAT_ratio | \n", "AAC_ratio | \n", "ACT_ratio | \n", "ACC_ratio | \n", "ACA_ratio | \n", "ACG_ratio | \n", "... | \n", "motility | \n", "range_salinity | \n", "cell_shape | \n", "isolation_source | \n", "doubling_h | \n", "genome_size | \n", "gc_content | \n", "coding_genes | \n", "tRNA_genes | \n", "rRNA16S_genes | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "7 | \n", "True | \n", "0.082550 | \n", "0.917450 | \n", "0.327724 | \n", "0.672276 | \n", "0.024511 | \n", "0.575353 | \n", "0.036262 | \n", "0.363875 | \n", "... | \n", "yes | \n", "None | \n", "None | \n", "host_plant | \n", "NaN | \n", "5369771.500 | \n", "67.300 | \n", "4713.667 | \n", "53.000 | \n", "3.0 | \n", "
1 | \n", "9 | \n", "False | \n", "0.919627 | \n", "0.080373 | \n", "0.858168 | \n", "0.141832 | \n", "0.454569 | \n", "0.050026 | \n", "0.446018 | \n", "0.049387 | \n", "... | \n", "yes | \n", "None | \n", "coccobacillus | \n", "host_animal_ectotherm | \n", "35.40 | \n", "601699.243 | \n", "25.469 | \n", "517.549 | \n", "30.485 | \n", "1.0 | \n", "
2 | \n", "11 | \n", "True | \n", "0.007302 | \n", "0.992698 | \n", "0.010730 | \n", "0.989270 | \n", "0.008586 | \n", "0.425527 | \n", "0.015789 | \n", "0.550098 | \n", "... | \n", "yes | \n", "None | \n", "bacillus | \n", "host_animal_endotherm | \n", "NaN | \n", "3526440.800 | \n", "73.805 | \n", "3139.333 | \n", "45.000 | \n", "2.0 | \n", "
3 | \n", "14 | \n", "True | \n", "0.664866 | \n", "0.335134 | \n", "0.775571 | \n", "0.224429 | \n", "0.458203 | \n", "0.188052 | \n", "0.304183 | \n", "0.049563 | \n", "... | \n", "no | \n", "None | \n", "bacillus | \n", "water_hotspring | \n", "2.47 | \n", "1959987.600 | \n", "33.700 | \n", "1876.333 | \n", "46.000 | \n", "2.0 | \n", "
4 | \n", "19 | \n", "True | \n", "0.551810 | \n", "0.448190 | \n", "0.440559 | \n", "0.559441 | \n", "0.099134 | \n", "0.591478 | \n", "0.097796 | \n", "0.211592 | \n", "... | \n", "yes | \n", "None | \n", "bacillus | \n", "petroleum | \n", "NaN | \n", "3722544.667 | \n", "55.100 | \n", "3222.667 | \n", "54.000 | \n", "2.0 | \n", "
5 rows × 86 columns
\n", "\n", " | species_taxid | \n", "species | \n", "phylum | \n", "superkingdom | \n", "
---|---|---|---|---|
683 | \n", "2337 | \n", "Thermotoga neapolitana | \n", "Thermotogae | \n", "Bacteria | \n", "
1190 | \n", "57487 | \n", "Pseudothermotoga hypogea | \n", "Thermotogae | \n", "Bacteria | \n", "
1201 | \n", "58290 | \n", "Archaeoglobus veneficus | \n", "Euryarchaeota | \n", "Archaea | \n", "
1463 | \n", "93929 | \n", "Thermotoga petrophila | \n", "Thermotogae | \n", "Bacteria | \n", "
1464 | \n", "93930 | \n", "Thermotoga naphthophila | \n", "Thermotogae | \n", "Bacteria | \n", "
2334 | \n", "565033 | \n", "Geoglobus acetivorans | \n", "Euryarchaeota | \n", "Archaea | \n", "
2529 | \n", "1184387 | \n", "Mesotoga prima | \n", "Thermotogae | \n", "Bacteria | \n", "
2558 | \n", "1236046 | \n", "Mesotoga infera | \n", "Thermotogae | \n", "Bacteria | \n", "