In [11]:
import configparser
import math
import os

import psycopg2
import pandas
from neo4j import GraphDatabase
import tqdm

import hetio.readwrite
import hetio.neo4j

In [12]:
# Read the database password from a local INI file so that it is never
# hard-coded in the notebook.
parser = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing file fails with a
# clear message instead of an opaque KeyError on the lookup below.
if not parser.read('database.ini'):
    raise FileNotFoundError(
        "Could not read 'database.ini'; it must define a [psql] section with a 'password' key"
    )

db_password = parser['psql']['password']

In [13]:
epilepsy_id = 'DOID:1826'

# Fetch the path-count rows (DWPC, p-value, metapath) for every node pair that involves
# epilepsy, ordered by ascending p-value. No LIMIT is applied, so this returns all such
# rows rather than a fixed "top N"; per the original author's note the pairs are all
# compound-disease pairs.
# The identifier is passed as a bound parameter (psycopg2 `%(name)s` style) instead of
# being formatted into the SQL text.
query = '''SELECT outer_pc.dwpc as dwpc, outer_pc.p_value as p_value, outer_pc.metapath_id as metapath_id, 
                  top_ids.source_name as source_name, top_ids.target_name as target_name 
FROM 
    (SELECT dwpc, p_value, metapath_id, source_id, target_id, n1.name AS source_name, n2.name AS target_name 
     FROM dj_hetmech_app_pathcount pc 
     JOIN dj_hetmech_app_node join_node  
         ON pc.target_id=join_node.id OR pc.source_id=join_node.id 
     JOIN dj_hetmech_app_node n1 
         ON pc.source_id = n1.id 
     JOIN dj_hetmech_app_node n2 
         ON pc.target_id = n2.id     
     WHERE join_node.identifier=%(epilepsy_id)s 
     ORDER BY pc.p_value) AS top_ids 
JOIN dj_hetmech_app_pathcount outer_pc 
     ON (top_ids.source_id = outer_pc.source_id AND 
         top_ids.target_id = outer_pc.target_id) OR 
         (top_ids.source_id = outer_pc.target_id AND 
         top_ids.target_id = outer_pc.source_id)
ORDER BY outer_pc.p_value;
'''

connection = psycopg2.connect(host = 'hetmech-db-dev.cobepk65dd7j.us-east-1.rds.amazonaws.com', 
                              database = 'dj_hetmech', user = 'read_only_user', password = db_password)

# Using a psycopg2 connection as a context manager only ends the transaction; it does not
# close the connection. Close it explicitly, even if the query raises.
try:
    top_metapaths = pandas.read_sql(query, connection, params={'epilepsy_id': epilepsy_id})
finally:
    connection.close()

In [14]:
# Collapse the query result to a single row per (source_name, metapath_id) pair,
# order it from smallest to largest p-value, and discard rows containing NaN.
top_metapaths = (
    top_metapaths
    .sort_values(by=['source_name', 'metapath_id'])
    .drop_duplicates(subset=['source_name', 'metapath_id'])
    .sort_values(by='p_value')
    .dropna()
)

# A p-value of exactly zero has no finite logarithm, so replace zeros with the
# smallest nonzero p-value observed before taking -log10.
is_zero = top_metapaths.p_value == 0
min_p_value = top_metapaths.p_value[~is_zero].min()
top_metapaths.loc[is_zero, 'p_value'] = min_p_value
print(top_metapaths.p_value.min())

# Larger values mean more significant metapaths.
top_metapaths['neg_log_p_value'] = top_metapaths.p_value.map(lambda p: -math.log10(p))

top_metapaths.head()

3.13181113155575e-17


Unnamed: 0,dwpc,p_value,metapath_id,source_name,target_name,neg_log_p_value
0,3.509434,3.1318110000000004e-17,CcSEcCtD,Nitrazepam,epilepsy syndrome,16.504204
9,3.296422,5.733828e-17,CcSEcCtD,Bromazepam,epilepsy syndrome,16.241555
16,3.579689,7.03284e-17,CcSEcCtD,Lorazepam,epilepsy syndrome,16.152869
28,3.369589,7.210640000000001e-17,CcSEcCtD,Phenobarbital,epilepsy syndrome,16.142026
34,3.346266,2.518406e-16,CcSEcCtD,Ezogabine,epilepsy syndrome,15.598874


In [15]:
# Location of the Hetionet v1.0 metagraph JSON, pinned to a specific commit of
# the dhimmel/hetionet repository (adjacent literals concatenate to one URL).
url = (
    'https://github.com/dhimmel/hetionet/raw/'
    '76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/'
    'hetnet/json/hetionet-v1.0-metagraph.json'
)

# The metagraph is used later to turn metapath abbreviations into metapath objects.
metagraph = hetio.readwrite.read_metagraph(url)

In [16]:
def get_paths_for_metapath(metagraph, row, driver=None, damping_exponent=.5):
    '''
    Return a dataframe with one row per path found for a given source, target, and metapath
    
    Parameters
    ----------
    metagraph : a hetio.hetnet.Metagraph instance to interpret metapath abbreviations
    row : a row from a pandas dataframe with information about the given metapath, source, and target.
        The keys 'metapath_id', 'source_name', 'target_name', and 'neg_log_p_value' are read.
    driver : an optional, already-open neo4j driver to run the query with. The caller keeps
        ownership and must close it. If None, a driver for bolt://neo4j.het.io is opened for
        this call and closed before returning.
    damping_exponent : the value passed to the query as its 'w' parameter
    
    Returns
    -------
    pandas.DataFrame built from the query records, with these columns added to each record:
    'metapath', 'metapath_importance' (the row's neg_log_p_value), 'path_importance'
    (metapath_importance * percent_of_DWPC), and 'source'.
    '''
    metapath_data = metagraph.metapath_from_abbrev(row['metapath_id'])

    query = hetio.neo4j.construct_pdp_query(metapath_data, path_style='string', property='name')

    params = {
        'source': row['source_name'],
        'target': row['target_name'],
        'w': damping_exponent
    }

    # Only close a driver that was created here; a driver passed in by the caller is
    # meant to be reused across calls.
    owns_driver = driver is None
    if owns_driver:
        driver = GraphDatabase.driver("bolt://neo4j.het.io")
    try:
        with driver.session() as session:
            metapath_result = session.run(query, params).data()
    finally:
        if owns_driver:
            driver.close()

    # Annotate every returned record with the metapath-level information from `row`.
    for path in metapath_result:
        path['metapath'] = row['metapath_id']
        path['metapath_importance'] = row['neg_log_p_value']
        path['path_importance'] = path['metapath_importance'] * path['percent_of_DWPC']
        path['source'] = row['source_name']
    
    metapath_df = pandas.DataFrame(metapath_result)
        
    return metapath_df

In [17]:
# Look up the individual paths behind every row of top_metapaths (one query per
# row, shown with a notebook progress bar) and stack the per-row dataframes into
# a single table.
result_list = [
    get_paths_for_metapath(metagraph, row)
    for index, row in tqdm.tqdm_notebook(top_metapaths.iterrows(), total=len(top_metapaths.index))
]
result_df = pandas.concat(result_list, ignore_index=True)

HBox(children=(IntProgress(value=0, max=6740), HTML(value='')))




In [25]:
# Group paths by source, list the most important paths first within each source,
# and break ties by metapath abbreviation.
sort_columns = ['source', 'path_importance', 'metapath']
sort_ascending = [True, False, True]
result_df = result_df.sort_values(by=sort_columns, ascending=sort_ascending)
result_df.head()

Unnamed: 0,PDP,metapath,metapath_importance,path,path_importance,percent_of_DWPC,source
1181486,0.0118,CbGaD,1.968488,Abacavir–ADK–epilepsy syndrome,196.848818,100.0,Abacavir
1133753,0.000303,CbGdAlD,2.186818,Abacavir–ADH6–telencephalon–epilepsy syndrome,68.693397,31.412493,Abacavir
1133754,0.000155,CbGdAlD,2.186818,Abacavir–ADH6–medulla oblongata–epilepsy syndrome,35.187079,16.09054,Abacavir
1133755,0.000153,CbGdAlD,2.186818,Abacavir–ADH6–cerebellum–epilepsy syndrome,34.732125,15.882496,Abacavir
1410458,0.000846,CtDdGaD,1.10662,Abacavir–acquired immunodeficiency syndrome–HS...,24.962412,22.557351,Abacavir


In [27]:
# Write the ranked paths as a gzip-compressed, tab-separated file.
output_path = 'data/epilepsy_paths.tsv.gz'

# to_csv does not create missing parent directories; make sure 'data/' exists so the
# export cannot fail after the long-running path queries have finished.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# State the compression explicitly instead of relying on inference from the '.gz' suffix.
result_df.to_csv(output_path, index=False, sep='\t', compression='gzip')