{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import collections\n", "import os\n", "import json\n", "import logging\n", "import string\n", "import re\n", "import gzip\n", "\n", "from scipy.stats import entropy\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sqlalchemy import create_engine\n", "import statsmodels.api as sm\n", "from statsmodels.sandbox.regression.predstd import wls_prediction_std\n", "import networkx as nx\n", "from Bio import SeqIO\n", "\n", "if os.getcwd().endswith('notebook'):\n", " os.chdir('..')\n", "\n", "from rna_learn.codon_bias.graph import load_codon_bias" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "sns.set(palette='colorblind', font_scale=1.3)\n", "palette = sns.color_palette()\n", "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s (%(levelname)s) %(message)s\")\n", "logger = logging.getLogger(__name__)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "db_path = os.path.join(os.getcwd(), 'data/db/seq.db')\n", "engine = create_engine(f'sqlite+pysqlite:///{db_path}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Assign Gene Ontology (GO) labels from Pfam labels" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "pfam2go_path = os.path.join(os.getcwd(), 'data/domains/Pfam2go.txt')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def parse_pfam_to_go_file(path):\n", " line_re = r'^Pfam:([^\\s]+) ([^>]+) > GO:([^;]+) ; GO:([0-9]+)$'\n", " domain_to_go = collections.defaultdict(list)\n", " with open(path, 'r') as f:\n", " for line in f:\n", " if not line.strip() or line.startswith('!'):\n", " continue\n", " \n", " m = re.match(line_re, line)\n", " if m:\n", " pfam_id = m[1].strip()\n", " query = m[2].strip()\n", " go_label = m[3].strip()\n", " go_id = m[4].strip()\n", " \n", " domain_to_go[query].append((go_id, go_label))\n", " \n", " return dict(domain_to_go)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "domain_to_go = parse_pfam_to_go_file(pfam2go_path)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('0003824', 'catalytic activity'),\n", " ('0009236', 'cobalamin biosynthetic process')]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "domain_to_go['GATase_3']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "pfam_domains_path = os.path.join(\n", " os.getcwd(), \n", " f'data/domains/tri_nucleotide_bias/pfam/GCA_000005825.2_protein_domains.csv',\n", ")\n", "df = pd.read_csv(pfam_domains_path, index_col='protein_id')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
assembly_accessionrecord_typepfam_querypfam_accessionprotein_labelbelow_threshold
protein_id
ADC49551.1GCA_000005825.2pfamNAD_binding_10PF13460.6hypothetical-protein-BpOF4_07465False
ADC49551.1GCA_000005825.2pfamNAD_binding_4PF07993.12hypothetical-protein-BpOF4_07465False
ADC49551.1GCA_000005825.2pfamSemialdhyde_dhPF01118.24hypothetical-protein-BpOF4_07465False
ADC49551.1GCA_000005825.2pfamEpimerasePF01370.21hypothetical-protein-BpOF4_07465False
\n", "
" ], "text/plain": [ " assembly_accession record_type pfam_query pfam_accession \\\n", "protein_id \n", "ADC49551.1 GCA_000005825.2 pfam NAD_binding_10 PF13460.6 \n", "ADC49551.1 GCA_000005825.2 pfam NAD_binding_4 PF07993.12 \n", "ADC49551.1 GCA_000005825.2 pfam Semialdhyde_dh PF01118.24 \n", "ADC49551.1 GCA_000005825.2 pfam Epimerase PF01370.21 \n", "\n", " protein_label below_threshold \n", "protein_id \n", "ADC49551.1 hypothetical-protein-BpOF4_07465 False \n", "ADC49551.1 hypothetical-protein-BpOF4_07465 False \n", "ADC49551.1 hypothetical-protein-BpOF4_07465 False \n", "ADC49551.1 hypothetical-protein-BpOF4_07465 False " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[['ADC49551.1']]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.542255143837179" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.log10(0.818195847004645/0.02347481222011315)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.709965388637482" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.log10(0.11764705882352941/0.022941176470588232)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }