{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import configparser\n",
"import math\n",
"\n",
"import psycopg2\n",
"import pandas\n",
"from neo4j import GraphDatabase\n",
"import tqdm\n",
"\n",
"import hetio.readwrite\n",
"import hetio.neo4j"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"parser = configparser.ConfigParser()\n",
"parser.read('database.ini')\n",
"\n",
"db_password = parser['psql']['password']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"epilepsy_id = 'DOID:1826'\n",
"\n",
"# Get top ten most important metapaths for epilepsy (which are all compound-disease pairs)\n",
"query = f'''SELECT outer_pc.dwpc as dwpc, outer_pc.p_value as p_value, outer_pc.metapath_id as metapath_id, \n",
" top_ids.source_name as source_name, top_ids.target_name as target_name \n",
"FROM \n",
" (SELECT dwpc, p_value, metapath_id, source_id, target_id, n1.name AS source_name, n2.name AS target_name \n",
" FROM dj_hetmech_app_pathcount pc \n",
" JOIN dj_hetmech_app_node join_node \n",
" ON pc.target_id=join_node.id OR pc.source_id=join_node.id \n",
" JOIN dj_hetmech_app_node n1 \n",
" ON pc.source_id = n1.id \n",
" JOIN dj_hetmech_app_node n2 \n",
" ON pc.target_id = n2.id \n",
" WHERE join_node.identifier='{epilepsy_id}' \n",
" ORDER BY pc.p_value) AS top_ids \n",
"JOIN dj_hetmech_app_pathcount outer_pc \n",
" ON (top_ids.source_id = outer_pc.source_id AND \n",
" top_ids.target_id = outer_pc.target_id) OR \n",
" (top_ids.source_id = outer_pc.target_id AND \n",
" top_ids.target_id = outer_pc.source_id)\n",
"ORDER BY outer_pc.p_value;\n",
"'''\n",
"\n",
"connection = psycopg2.connect(host = 'hetmech-db-dev.cobepk65dd7j.us-east-1.rds.amazonaws.com', \n",
" database = 'dj_hetmech', user = 'read_only_user', password = db_password)\n",
"\n",
"top_metapaths = pandas.read_sql(query, connection)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.13181113155575e-17\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" dwpc | \n",
" p_value | \n",
" metapath_id | \n",
" source_name | \n",
" target_name | \n",
" neg_log_p_value | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3.509434 | \n",
" 3.131811e-17 | \n",
" CcSEcCtD | \n",
" Nitrazepam | \n",
" epilepsy syndrome | \n",
" 16.504204 | \n",
"
\n",
" \n",
" 9 | \n",
" 3.296422 | \n",
" 5.733828e-17 | \n",
" CcSEcCtD | \n",
" Bromazepam | \n",
" epilepsy syndrome | \n",
" 16.241555 | \n",
"
\n",
" \n",
" 16 | \n",
" 3.579689 | \n",
" 7.032840e-17 | \n",
" CcSEcCtD | \n",
" Lorazepam | \n",
" epilepsy syndrome | \n",
" 16.152869 | \n",
"
\n",
" \n",
" 28 | \n",
" 3.369589 | \n",
" 7.210640e-17 | \n",
" CcSEcCtD | \n",
" Phenobarbital | \n",
" epilepsy syndrome | \n",
" 16.142026 | \n",
"
\n",
" \n",
" 34 | \n",
" 3.346266 | \n",
" 2.518406e-16 | \n",
" CcSEcCtD | \n",
" Ezogabine | \n",
" epilepsy syndrome | \n",
" 15.598874 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" dwpc p_value metapath_id source_name target_name \\\n",
"0 3.509434 3.131811e-17 CcSEcCtD Nitrazepam epilepsy syndrome \n",
"9 3.296422 5.733828e-17 CcSEcCtD Bromazepam epilepsy syndrome \n",
"16 3.579689 7.032840e-17 CcSEcCtD Lorazepam epilepsy syndrome \n",
"28 3.369589 7.210640e-17 CcSEcCtD Phenobarbital epilepsy syndrome \n",
"34 3.346266 2.518406e-16 CcSEcCtD Ezogabine epilepsy syndrome \n",
"\n",
" neg_log_p_value \n",
"0 16.504204 \n",
"9 16.241555 \n",
"16 16.152869 \n",
"28 16.142026 \n",
"34 15.598874 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_metapaths = top_metapaths.sort_values(by=['source_name', 'metapath_id'])\n",
"\n",
"# Ensure that you only have one copy of each (source_name, metapath_id) pair\n",
"top_metapaths = top_metapaths.drop_duplicates(subset=['source_name', 'metapath_id'])\n",
"\n",
"top_metapaths = top_metapaths.sort_values(by='p_value')\n",
"\n",
"# Remove any rows with NaN values\n",
"top_metapaths = top_metapaths.dropna()\n",
"\n",
"min_p_value = top_metapaths[top_metapaths.p_value != 0].p_value.min()\n",
"\n",
"top_metapaths.loc[top_metapaths.p_value == 0, 'p_value'] = min_p_value\n",
"print(top_metapaths.p_value.min())\n",
"\n",
"top_metapaths['neg_log_p_value'] = top_metapaths.p_value.apply(lambda x: -math.log10(x))\n",
"\n",
"top_metapaths.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0-metagraph.json'\n",
"\n",
"metagraph = hetio.readwrite.read_metagraph(url)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def get_paths_for_metapath(metagraph, row):\n",
" '''\n",
" Return a list of dictionaries containing the information for all paths with a given source, target, and metapath\n",
" \n",
" Parameters\n",
" ----------\n",
" metagraph : a hetio.hetnet.Metagraph instance to interpret metapath abbreviations\n",
" row : a row from a pandas dataframe with information about the given metapath, source, and target\n",
" '''\n",
" damping_exponent = .5\n",
" \n",
" metapath_data = metagraph.metapath_from_abbrev(row['metapath_id'])\n",
"\n",
" query = hetio.neo4j.construct_pdp_query(metapath_data, path_style='string', property='name')\n",
"\n",
" driver = GraphDatabase.driver(\"bolt://neo4j.het.io\")\n",
" params = {\n",
" 'source': row['source_name'],\n",
" 'target': row['target_name'],\n",
" 'w': damping_exponent\n",
" }\n",
" with driver.session() as session:\n",
" metapath_result = session.run(query, params)\n",
" metapath_result = metapath_result.data()\n",
"\n",
" for path in metapath_result:\n",
" path['metapath'] = row['metapath_id']\n",
" path['metapath_importance'] = row['neg_log_p_value']\n",
" path['path_importance'] = path['metapath_importance'] * path['percent_of_DWPC']\n",
" path['source'] = row['source_name']\n",
" \n",
" metapath_df = pandas.DataFrame(metapath_result)\n",
" \n",
" return metapath_df"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "595b8e153cad41a3a901b55e045fbb3f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=6740), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# For row in top_metapaths\n",
"\n",
"result_list = []\n",
"\n",
"for index, row in tqdm.tqdm_notebook(top_metapaths.iterrows(), total=len(top_metapaths.index)):\n",
" metapath_df = get_paths_for_metapath(metagraph, row)\n",
" result_list.append(metapath_df)\n",
"result_df = pandas.concat(result_list, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PDP | \n",
" metapath | \n",
" metapath_importance | \n",
" path | \n",
" path_importance | \n",
" percent_of_DWPC | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" 1181486 | \n",
" 0.011800 | \n",
" CbGaD | \n",
" 1.968488 | \n",
" Abacavir–ADK–epilepsy syndrome | \n",
" 196.848818 | \n",
" 100.000000 | \n",
" Abacavir | \n",
"
\n",
" \n",
" 1133753 | \n",
" 0.000303 | \n",
" CbGdAlD | \n",
" 2.186818 | \n",
" Abacavir–ADH6–telencephalon–epilepsy syndrome | \n",
" 68.693397 | \n",
" 31.412493 | \n",
" Abacavir | \n",
"
\n",
" \n",
" 1133754 | \n",
" 0.000155 | \n",
" CbGdAlD | \n",
" 2.186818 | \n",
" Abacavir–ADH6–medulla oblongata–epilepsy syndrome | \n",
" 35.187079 | \n",
" 16.090540 | \n",
" Abacavir | \n",
"
\n",
" \n",
" 1133755 | \n",
" 0.000153 | \n",
" CbGdAlD | \n",
" 2.186818 | \n",
" Abacavir–ADH6–cerebellum–epilepsy syndrome | \n",
" 34.732125 | \n",
" 15.882496 | \n",
" Abacavir | \n",
"
\n",
" \n",
" 1410458 | \n",
" 0.000846 | \n",
" CtDdGaD | \n",
" 1.106620 | \n",
" Abacavir–acquired immunodeficiency syndrome–HS... | \n",
" 24.962412 | \n",
" 22.557351 | \n",
" Abacavir | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PDP metapath metapath_importance \\\n",
"1181486 0.011800 CbGaD 1.968488 \n",
"1133753 0.000303 CbGdAlD 2.186818 \n",
"1133754 0.000155 CbGdAlD 2.186818 \n",
"1133755 0.000153 CbGdAlD 2.186818 \n",
"1410458 0.000846 CtDdGaD 1.106620 \n",
"\n",
" path path_importance \\\n",
"1181486 Abacavir–ADK–epilepsy syndrome 196.848818 \n",
"1133753 Abacavir–ADH6–telencephalon–epilepsy syndrome 68.693397 \n",
"1133754 Abacavir–ADH6–medulla oblongata–epilepsy syndrome 35.187079 \n",
"1133755 Abacavir–ADH6–cerebellum–epilepsy syndrome 34.732125 \n",
"1410458 Abacavir–acquired immunodeficiency syndrome–HS... 24.962412 \n",
"\n",
" percent_of_DWPC source \n",
"1181486 100.000000 Abacavir \n",
"1133753 31.412493 Abacavir \n",
"1133754 16.090540 Abacavir \n",
"1133755 15.882496 Abacavir \n",
"1410458 22.557351 Abacavir "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_df = result_df.sort_values(by=['source', 'path_importance', 'metapath'], ascending=[True, False, True])\n",
"result_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"result_df.to_csv('data/epilepsy_paths.tsv.gz', index=False, sep='\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:hetmech]",
"language": "python",
"name": "conda-env-hetmech-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}