{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import os\n",
    "import json\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from tensorflow.keras.models import load_model\n",
    "import statsmodels.api as sm\n",
    "\n",
    "# When started from a directory whose name ends in 'notebook', step up one level\n",
    "# before importing `rna_learn` and before the cwd-based paths below are built.\n",
    "if os.getcwd().endswith('notebook'):\n",
    "    os.chdir('..')\n",
    "\n",
    "from rna_learn.transform import (\n",
    "    sequence_embedding, \n",
    "    normalize, denormalize,\n",
    "    make_dataset_balanced,\n",
    "    one_hot_encode_classes,\n",
    "    split_train_test_set,\n",
    ")\n",
    "from rna_learn.load import load_mrna_model, load_dataset\n",
    "from rna_learn.model import conv1d_regression_model, compile_regression_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Global seaborn theme: colorblind-safe palette and enlarged fonts.\n",
    "sns.set(palette='colorblind', font_scale=1.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nucleotide alphabet (its length sizes the model input) and temperature class labels.\n",
    "alphabet = ['A', 'T', 'G', 'C']\n",
    "classes = ['psychrophilic', 'mesophilic', 'thermophilic']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identifier of the saved training run to load.\n",
    "run_id = 'run_5vu8k'\n",
    "\n",
    "# Pass each path component separately so os.path.join chooses the separator, and\n",
    "# derive the run directory once so the weights and metadata paths cannot drift apart.\n",
    "project_root = os.getcwd()\n",
    "run_dir = os.path.join(project_root, 'saved_models_regression', run_id)\n",
    "model_path = os.path.join(run_dir, 'model.h5')\n",
    "hyperparameters_path = os.path.join(run_dir, 'metadata.json')\n",
    "test_set_path = os.path.join(project_root, 'data', 'dataset_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'run_id': 'run_5vu8k',\n",
       " 'alphabet': ['A', 'T', 'G', 'C'],\n",
       " 'classes': ['psychrophilic', 'mesophilic', 'thermophilic'],\n",
       " 'n_epochs': 41,\n",
       " 'n_conv_1': 3,\n",
       " 'n_filters_1': 88,\n",
       " 'kernel_size_1': 29,\n",
       " 'n_conv_2': 1,\n",
       " 'n_filters_2': 54,\n",
       " 'kernel_size_2': 44,\n",
       " 'l2_reg': 0.0001,\n",
       " 'dropout': 0.5,\n",
       " 'val_loss': 
1.00002932969929,\n",
       " 'val_mae': 12.764189720153809}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the hyperparameters stored in this run's metadata.json.\n",
    "with open(hyperparameters_path) as f:\n",
    "    metadata = json.load(f)\n",
    "\n",
    "\n",
    "metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"model\"\n",
      "_________________________________________________________________\n",
      "Layer (type) Output Shape Param # \n",
      "=================================================================\n",
      "sequence (InputLayer) [(None, None, 4)] 0 \n",
      "_________________________________________________________________\n",
      "conv1d (Conv1D) (None, None, 88) 10296 \n",
      "_________________________________________________________________\n",
      "conv1d_1 (Conv1D) (None, None, 88) 224664 \n",
      "_________________________________________________________________\n",
      "conv1d_2 (Conv1D) (None, None, 88) 224664 \n",
      "_________________________________________________________________\n",
      "conv1d_3 (Conv1D) (None, None, 54) 209142 \n",
      "_________________________________________________________________\n",
      "global_average_pooling1d (Gl (None, 54) 0 \n",
      "_________________________________________________________________\n",
      "dropout (Dropout) (None, 54) 0 \n",
      "_________________________________________________________________\n",
      "dense (Dense) (None, 2) 110 \n",
      "_________________________________________________________________\n",
      "independent_normal (Independ ((None, 1), (None, 1)) 0 \n",
      "=================================================================\n",
      "Total params: 668,876\n",
      "Trainable params: 668,876\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# Hyperparameters forwarded unchanged, as keyword arguments, from the run metadata.\n",
    "architecture_keys = [\n",
    "    'n_conv_1', 'n_filters_1', 'kernel_size_1',\n",
    "    'n_conv_2', 'n_filters_2', 'kernel_size_2',\n",
    "    'l2_reg', 'dropout',\n",
    "]\n",
    "architecture = {key: metadata[key] for key in architecture_keys}\n",
    "\n",
    "# Rebuild the network, compile it, then load the weights saved at model_path.\n",
    "model = conv1d_regression_model(alphabet_size=len(alphabet), **architecture)\n",
    "compile_regression_model(model, learning_rate=1e-4)\n",
    "model.load_weights(model_path)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | specie_name | \n", "seqid | \n", "gene_name | \n", "start_inclusive | \n", "end_exclusive | \n", "length | \n", "strand | \n", "temperature | \n", "temperature_range | \n", "sequence | \n", "gc_content | \n", "ag_content | \n", "gt_content | \n", "secondary_structure | \n", "paired_nucleotides | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "Micropolyspora internatus | \n", "NC_013159.1 | \n", "rnpB | \n", "742238 | \n", "742645 | \n", "407 | \n", "+ | \n", "45.0 | \n", "thermophilic | \n", "CGAGTTGGCAGGGCGGCCGCGGCCGAGGGCATCGTCTCGACGTCTT... | \n", "0.685504 | \n", "0.562654 | \n", "0.508600 | \n", "((((((((((((((..((.(((((((((((.(((...))).)))))... | \n", "0.707617 | \n", "
1 | \n", "Listonella anguillarum | \n", "NC_015633.1 | \n", "recA | \n", "2753532 | \n", "2754579 | \n", "1047 | \n", "- | \n", "20.0 | \n", "psychrophilic | \n", "ATGGACGAAAATAAGCAGAAGGCGCTAGCCGCAGCACTGGGTCAAA... | \n", "0.442216 | \n", "0.540592 | \n", "0.510029 | \n", "....................(((....)))(((((..(((((((((... | \n", "0.617001 | \n", "
2 | \n", "Comamonas badia | \n", "NZ_AXVM01000006.1 | \n", "rpsR | \n", "29593 | \n", "29878 | \n", "285 | \n", "- | \n", "28.0 | \n", "mesophilic | \n", "TTGACCATGTTCAGGAAATTCAACAAGAATGGCAAGAACGGCAAGC... | \n", "0.568421 | \n", "0.494737 | \n", "0.414035 | \n", "(((.((..((((..(..((((.....))))..)..)))))))))((... | \n", "0.610526 | \n", "
3 | \n", "Acetobacter orientalis | \n", "NZ_BAMX01000009.1 | \n", "tsaD | \n", "11584 | \n", "12799 | \n", "1215 | \n", "- | \n", "30.0 | \n", "mesophilic | \n", "ATGGCGGTCAGCAGCCAGTTTTCAGGCTTACCCGGCACCCCTCACA... | \n", "0.596708 | \n", "0.469959 | \n", "0.476543 | \n", ".(((.((((...((((........)))).....(((............. | \n", "0.627160 | \n", "
4 | \n", "Alicyclobacillus kakegawensis | \n", "NZ_BCRP01000001.1 | \n", "tsaD | \n", "64880 | \n", "65918 | \n", "1038 | \n", "- | \n", "50.0 | \n", "thermophilic | \n", "TTGCTCCTGTTGGGCATTGAGACGAGTTGCGACGAGACCGCCGCGG... | \n", "0.680154 | \n", "0.516378 | \n", "0.564547 | \n", "......((((.(((((.((...((....((((((.(((((.(((((... | \n", "0.697495 | \n", "