{ "cells": [ { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import configparser\n", "import math\n", "\n", "import psycopg2\n", "import pandas\n", "from neo4j import GraphDatabase\n", "import tqdm\n", "\n", "import hetio.readwrite\n", "import hetio.neo4j" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "parser = configparser.ConfigParser()\n", "parser.read('database.ini')\n", "\n", "db_password = parser['psql']['password']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "epilepsy_id = 'DOID:1826'\n", "\n", "# Get top ten most important metapaths for epilepsy (which are all compound-disease pairs)\n", "query = f'''SELECT outer_pc.dwpc as dwpc, outer_pc.p_value as p_value, outer_pc.metapath_id as metapath_id, \n", " top_ids.source_name as source_name, top_ids.target_name as target_name \n", "FROM \n", " (SELECT dwpc, p_value, metapath_id, source_id, target_id, n1.name AS source_name, n2.name AS target_name \n", " FROM dj_hetmech_app_pathcount pc \n", " JOIN dj_hetmech_app_node join_node \n", " ON pc.target_id=join_node.id OR pc.source_id=join_node.id \n", " JOIN dj_hetmech_app_node n1 \n", " ON pc.source_id = n1.id \n", " JOIN dj_hetmech_app_node n2 \n", " ON pc.target_id = n2.id \n", " WHERE join_node.identifier='{epilepsy_id}' \n", " ORDER BY pc.p_value) AS top_ids \n", "JOIN dj_hetmech_app_pathcount outer_pc \n", " ON (top_ids.source_id = outer_pc.source_id AND \n", " top_ids.target_id = outer_pc.target_id) OR \n", " (top_ids.source_id = outer_pc.target_id AND \n", " top_ids.target_id = outer_pc.source_id)\n", "ORDER BY outer_pc.p_value;\n", "'''\n", "\n", "connection = psycopg2.connect(host = 'hetmech-db-dev.cobepk65dd7j.us-east-1.rds.amazonaws.com', \n", " database = 'dj_hetmech', user = 'read_only_user', password = db_password)\n", "\n", "top_metapaths = pandas.read_sql(query, connection)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.13181113155575e-17\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dwpcp_valuemetapath_idsource_nametarget_nameneg_log_p_value
03.5094343.131811e-17CcSEcCtDNitrazepamepilepsy syndrome16.504204
93.2964225.733828e-17CcSEcCtDBromazepamepilepsy syndrome16.241555
163.5796897.032840e-17CcSEcCtDLorazepamepilepsy syndrome16.152869
283.3695897.210640e-17CcSEcCtDPhenobarbitalepilepsy syndrome16.142026
343.3462662.518406e-16CcSEcCtDEzogabineepilepsy syndrome15.598874
\n", "
" ], "text/plain": [ " dwpc p_value metapath_id source_name target_name \\\n", "0 3.509434 3.131811e-17 CcSEcCtD Nitrazepam epilepsy syndrome \n", "9 3.296422 5.733828e-17 CcSEcCtD Bromazepam epilepsy syndrome \n", "16 3.579689 7.032840e-17 CcSEcCtD Lorazepam epilepsy syndrome \n", "28 3.369589 7.210640e-17 CcSEcCtD Phenobarbital epilepsy syndrome \n", "34 3.346266 2.518406e-16 CcSEcCtD Ezogabine epilepsy syndrome \n", "\n", " neg_log_p_value \n", "0 16.504204 \n", "9 16.241555 \n", "16 16.152869 \n", "28 16.142026 \n", "34 15.598874 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_metapaths = top_metapaths.sort_values(by=['source_name', 'metapath_id'])\n", "\n", "# Ensure that you only have one copy of each (source_name, metapath_id) pair\n", "top_metapaths = top_metapaths.drop_duplicates(subset=['source_name', 'metapath_id'])\n", "\n", "top_metapaths = top_metapaths.sort_values(by='p_value')\n", "\n", "# Remove any rows with NaN values\n", "top_metapaths = top_metapaths.dropna()\n", "\n", "min_p_value = top_metapaths[top_metapaths.p_value != 0].p_value.min()\n", "\n", "top_metapaths.loc[top_metapaths.p_value == 0, 'p_value'] = min_p_value\n", "print(top_metapaths.p_value.min())\n", "\n", "top_metapaths['neg_log_p_value'] = top_metapaths.p_value.apply(lambda x: -math.log10(x))\n", "\n", "top_metapaths.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0-metagraph.json'\n", "\n", "metagraph = hetio.readwrite.read_metagraph(url)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def get_paths_for_metapath(metagraph, row):\n", " '''\n", " Return a list of dictionaries containing the information for all paths with a given source, target, and metapath\n", " \n", " Parameters\n", " ----------\n", " metagraph : a hetio.hetnet.Metagraph instance to interpret metapath abbreviations\n", " row : a row from a pandas dataframe with information about the given metapath, source, and target\n", " '''\n", " damping_exponent = .5\n", " \n", " metapath_data = metagraph.metapath_from_abbrev(row['metapath_id'])\n", "\n", " query = hetio.neo4j.construct_pdp_query(metapath_data, path_style='string', property='name')\n", "\n", " driver = GraphDatabase.driver(\"bolt://neo4j.het.io\")\n", " params = {\n", " 'source': row['source_name'],\n", " 'target': row['target_name'],\n", " 'w': damping_exponent\n", " }\n", " with driver.session() as session:\n", " metapath_result = session.run(query, params)\n", " metapath_result = metapath_result.data()\n", "\n", " for path in metapath_result:\n", " path['metapath'] = row['metapath_id']\n", " path['metapath_importance'] = row['neg_log_p_value']\n", " path['path_importance'] = path['metapath_importance'] * path['percent_of_DWPC']\n", " path['source'] = row['source_name']\n", " \n", " metapath_df = pandas.DataFrame(metapath_result)\n", " \n", " return metapath_df" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "595b8e153cad41a3a901b55e045fbb3f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=6740), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# For row in top_metapaths\n", "\n", "result_list = []\n", "\n", "for index, row in tqdm.tqdm_notebook(top_metapaths.iterrows(), total=len(top_metapaths.index)):\n", " metapath_df = get_paths_for_metapath(metagraph, row)\n", " result_list.append(metapath_df)\n", "result_df = pandas.concat(result_list, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PDPmetapathmetapath_importancepathpath_importancepercent_of_DWPCsource
11814860.011800CbGaD1.968488Abacavir–ADK–epilepsy syndrome196.848818100.000000Abacavir
11337530.000303CbGdAlD2.186818Abacavir–ADH6–telencephalon–epilepsy syndrome68.69339731.412493Abacavir
11337540.000155CbGdAlD2.186818Abacavir–ADH6–medulla oblongata–epilepsy syndrome35.18707916.090540Abacavir
11337550.000153CbGdAlD2.186818Abacavir–ADH6–cerebellum–epilepsy syndrome34.73212515.882496Abacavir
14104580.000846CtDdGaD1.106620Abacavir–acquired immunodeficiency syndrome–HS...24.96241222.557351Abacavir
\n", "
" ], "text/plain": [ " PDP metapath metapath_importance \\\n", "1181486 0.011800 CbGaD 1.968488 \n", "1133753 0.000303 CbGdAlD 2.186818 \n", "1133754 0.000155 CbGdAlD 2.186818 \n", "1133755 0.000153 CbGdAlD 2.186818 \n", "1410458 0.000846 CtDdGaD 1.106620 \n", "\n", " path path_importance \\\n", "1181486 Abacavir–ADK–epilepsy syndrome 196.848818 \n", "1133753 Abacavir–ADH6–telencephalon–epilepsy syndrome 68.693397 \n", "1133754 Abacavir–ADH6–medulla oblongata–epilepsy syndrome 35.187079 \n", "1133755 Abacavir–ADH6–cerebellum–epilepsy syndrome 34.732125 \n", "1410458 Abacavir–acquired immunodeficiency syndrome–HS... 24.962412 \n", "\n", " percent_of_DWPC source \n", "1181486 100.000000 Abacavir \n", "1133753 31.412493 Abacavir \n", "1133754 16.090540 Abacavir \n", "1133755 15.882496 Abacavir \n", "1410458 22.557351 Abacavir " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result_df = result_df.sort_values(by=['source', 'path_importance', 'metapath'], ascending=[True, False, True])\n", "result_df.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "result_df.to_csv('data/epilepsy_paths.tsv.gz', index=False, sep='\\t')" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:hetmech]", "language": "python", "name": "conda-env-hetmech-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }