# Convert DrugCentral relationships to Rephetio identifiers

In [1]:
import urllib
import json

import pandas

## Read DO Slim - the disease subset used for Rephetio

In [2]:
# Fetch the DO Slim term table from a commit-pinned URL and keep only the
# slim term, its name, and the identifier of each term it subsumes.
url = 'https://github.com/dhimmel/disease-ontology/raw/5cb93c38568536222b0a14fbcb7fb644a348931d/data/slim-terms-prop.tsv'
do_slim = pandas.read_table(url).loc[:, ['slim_id', 'slim_name', 'subsumed_id']]
do_slim.head(2)

Unnamed: 0,slim_id,slim_name,subsumed_id
0,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156
1,DOID:0050425,restless legs syndrome,DOID:0050425


## Read UniProt to Entrez Gene mapping

In [3]:
# Load the gzip-compressed, commit-pinned table that maps UniProt accessions
# (column `uniprot`) to Entrez Gene identifiers (column `GeneID`).
url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz'
entrez_map_df = pandas.read_table(url, compression='gzip')
entrez_map_df.head(2)

Unnamed: 0,uniprot,GeneID
0,A0A010PZJ8,19039206
1,A0A010PZK3,19039211


## Read DrugBank Slim

In [4]:
# Load DrugBank Slim from a commit-pinned URL, keeping just the identifier and
# the compound name (renamed so it stays unambiguous after later merges).
url = 'https://github.com/dhimmel/drugbank/raw/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank-slim.tsv'
drugbank_df = (
    pandas.read_table(url)
    .loc[:, ['drugbank_id', 'name']]
    .rename(columns={'name': 'drugbank_name'})
)
drugbank_df.head(2)

Unnamed: 0,drugbank_id,drugbank_name
0,DB00014,Goserelin
1,DB00035,Desmopressin


In [5]:
# Number of compounds in DrugBank Slim before joining any other identifiers.
len(drugbank_df)

1552

## Read identifiers

In [6]:
# Read the identifier table, keep only the DrugBank cross-references, and
# attach the matching DRUG_ID to every DrugBank Slim compound (inner merge on
# the shared `drugbank_id` column).
path = 'drugtarget/identifiers.tsv'
id_df = (
    pandas.read_table(path)
    .query("ID_TYPE == 'DRUGBANK_ID'")
    .loc[:, ['DRUG_ID', 'IDENTIFIER']]
    .rename(columns={'IDENTIFIER': 'drugbank_id'})
)
drugbank_df = id_df.merge(drugbank_df)
drugbank_df.head(2)

Unnamed: 0,DRUG_ID,drugbank_id,drugbank_name
0,1327,DB00014,Goserelin
1,817,DB00035,Desmopressin


In [7]:
# Row count after the identifier merge. It can exceed the DrugBank Slim count
# when more than one DRUG_ID maps to the same drugbank_id.
len(drugbank_df)

1634

## Convert drug targets

In [8]:
# Load the drug-target relationships and attach DrugBank identifiers. The
# merge uses whatever columns the two tables share (presumably DRUG_ID --
# confirm against the input file).
path = 'drugtarget/drug_target.tsv'
target_df = pandas.read_table(path)
target_df = drugbank_df.merge(target_df)
target_df = target_df[['drugbank_id', 'drugbank_name', 'TARGET_NAME', 'TARGET_FAMILY', 'UNIPROT', 'ACTION_TYPE', 'SOURCE', 'REFERENCE']]

# Split multi-protein targets into many rows: expand the '|'-delimited UNIPROT
# field into one column per accession, then stack the columns into a single
# Series that repeats the original row label for each accession.
# (No extra positional argument to Series.apply: it would be interpreted as
# the deprecated `convert_dtype` flag, not as an axis.)
s = target_df.UNIPROT.str.split('|').apply(pandas.Series).stack()
s.index = s.index.droplevel(-1)
s.name = 'uniprot'
del target_df['UNIPROT']
# Joining on the index duplicates a row once per accession it listed.
target_df = target_df.join(s)

# Convert UniProt accessions to Entrez GeneIDs; the inner merge drops targets
# without a mapping.
target_df = entrez_map_df.merge(target_df)
del target_df['uniprot']

target_df['action'] = target_df['ACTION_TYPE'].str.lower()
del target_df['ACTION_TYPE']

# expand=False guarantees a Series (rather than a one-column DataFrame) for
# the single capture group, whatever the pandas default is.
target_df['pubmed_id'] = target_df.REFERENCE.str.extract(r'pubmed/([0-9]+)', expand=False)

target_df = target_df.drop_duplicates()
target_df.head(2)

Unnamed: 0,GeneID,drugbank_id,drugbank_name,TARGET_NAME,TARGET_FAMILY,SOURCE,REFERENCE,action,pubmed_id
0,8233868,DB00431,Lindane,GABA-A receptor,Ion channel,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,negative allosteric modulator,
1,8232849,DB08823,Spinosad,Nicotinic acetylcholine receptor,Ion channel,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,agonist,


In [9]:
# Display labels for the raw SOURCE codes of the drug-target table.
# Series.map leaves NaN for any code that is not a key of this dictionary.
target_source_map = {
    'CHEMBL': 'DrugCentral (ChEMBL)',
    'SCIENTIFIC LITERATURE': 'DrugCentral (literature)',
    'DRUG LABEL': 'DrugCentral (label)',
    'IUPHAR': 'DrugCentral (IUPHAR)',
    'KEGG DRUG': 'DrugCentral (KEGG DRUG)',
}
target_df['SOURCE'] = target_df['SOURCE'].map(target_source_map)
target_df['SOURCE'].value_counts()

DrugCentral (ChEMBL)        2922
DrugCentral (literature)     182
DrugCentral (label)           89
DrugCentral (IUPHAR)          56
DrugCentral (KEGG DRUG)       25
Name: SOURCE, dtype: int64

In [10]:
def _join_sorted_unique(values):
    """Return the distinct non-missing strings of a Series, sorted and '|'-joined."""
    return '|'.join(sorted(values.dropna().unique()))


def condense_targets(df):
    """Condense drug-target relationships.

    Collapse all rows of one drug-target pair into a single record whose
    fields are '|'-delimited, sorted, de-duplicated strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group, with string columns ``pubmed_id``, ``SOURCE``,
        ``action`` and ``REFERENCE``. Missing values in any of these columns
        are ignored rather than raising ``TypeError``.

    Returns
    -------
    pandas.Series
        Object-dtype Series indexed by ``pubmed_ids``, ``sources``,
        ``actions`` and ``urls`` (in that order). A field with no values is
        the empty string. ``urls`` holds only references that do not contain
        the substring ``'pubmed'``.
    """
    # References pointing at PubMed are already captured by `pubmed_ids`.
    urls = '|'.join(sorted(
        url for url in df.REFERENCE.dropna().unique() if 'pubmed' not in url
    ))
    values = [
        _join_sorted_unique(df.pubmed_id),
        _join_sorted_unique(df.SOURCE),
        _join_sorted_unique(df.action),
        urls,
    ]
    # An explicit index keeps the column order stable, and an explicit dtype
    # avoids the deprecated dtype-less Series construction.
    return pandas.Series(
        values,
        index=['pubmed_ids', 'sources', 'actions', 'urls'],
        dtype=object,
    )
    
# One output row per (gene, drug) pair, with the evidence fields condensed
# by condense_targets; reset_index turns the group keys back into columns.
target_df = (
    target_df
    .groupby(['GeneID', 'drugbank_id', 'drugbank_name'])
    .apply(condense_targets)
    .reset_index()
)

In [11]:
# Write the condensed drug-target table as tab-separated text, omitting the
# DataFrame index.
target_df.to_csv('rephetio/targets.tsv', sep='\t', index=False)

## Read and process DrugCentral Indications

In [12]:
# Read the indication table; SNOMEDCT_CUI is forced to str so pandas does not
# reinterpret the identifiers as numbers.
path = 'drugtarget/drug_indication.tsv'
indication_df = (
    pandas.read_table(path, dtype={'SNOMEDCT_CUI': str})
    .rename(columns={'DOID': 'subsumed_id'})
)

# Attach DrugBank identifiers first, then map each disease term onto the DO
# Slim term that subsumes it. Both merges are inner joins on shared columns.
indication_df = do_slim.merge(drugbank_df.merge(indication_df))

# Reduce to one sorted, de-duplicated row per slim disease / drug pair.
indication_df = (
    indication_df
    .drop('DRUG_ID', axis='columns')
    .loc[:, ['slim_id', 'drugbank_id', 'slim_name', 'drugbank_name']]
    .rename(columns={'slim_id': 'doid_id', 'slim_name': 'disease', 'drugbank_name': 'drug'})
    .sort_values(['disease', 'drug'])
    .drop_duplicates()
)

#### Compare to PharmacotherapyDB

In [13]:
# Annotate every indication with its PharmacotherapyDB category. The left
# merge keeps indications that have no match, leaving `category` as NaN.
url = 'https://github.com/dhimmel/indications/raw/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv'
phcoth_df = pandas.read_table(url).loc[:, ['doid_id', 'drugbank_id', 'category']]
indication_df = indication_df.merge(phcoth_df, how='left')
indication_df.head(2)

Unnamed: 0,doid_id,drugbank_id,disease,drug,category
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM


In [14]:
# Number of indication rows after the PharmacotherapyDB annotation.
len(indication_df)

671

In [15]:
# Tally the PharmacotherapyDB categories; dropna=False also counts the
# indications whose category is missing (NaN).
indication_df.category.value_counts(dropna=False)

DM     359
NaN    210
SYM     77
NOT     25
Name: category, dtype: int64

In [16]:
# Write the annotated indications as tab-separated text, omitting the
# DataFrame index.
indication_df.to_csv('rephetio/indications.tsv', sep='\t', index=False)

## Pharmacologic class

In [17]:
# Read the pharmacologic class assignments and restrict them to drugs that
# have a DrugBank identifier (inner merge on shared columns).
path = 'drugtarget/pharm_class.tsv'
class_df = drugbank_df.merge(pandas.read_table(path))

# Catalogue of distinct classes, independent of which drugs belong to them.
classes_df = class_df.loc[:, ['TYPE', 'CLASS_SOURCE_ID', 'CLASS', 'SOURCE']].drop_duplicates()

# Distinct drug-to-class memberships.
class_df = (
    class_df
    .loc[:, ['drugbank_id', 'drugbank_name', 'CLASS_SOURCE_ID', 'CLASS']]
    .rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name'})
    .drop_duplicates()
)
class_df.head(2)

Unnamed: 0,drugbank_id,drugbank_name,class_id,class_name
0,DB00014,Goserelin,N0000175655,Gonadotropin Releasing Hormone Receptor Agonist
1,DB00014,Goserelin,N0000175654,Gonadotropin Releasing Hormone Receptor Agonists


In [18]:
# Number of distinct pharmacologic class records
# (unique TYPE / CLASS_SOURCE_ID / CLASS / SOURCE combinations)
len(classes_df)

1262

In [19]:
# Number of distinct drug-to-class memberships
len(class_df)

10959

In [20]:
# Readable names for the class type codes; applied to the TYPE column of the
# class catalogue.
class_type_map = {
    'MoA': 'Mechanism of Action',
    'PE': 'Physiologic Effect',
    'CS': 'Chemical Structure',
    'EPC': 'FDA Established Pharmacologic Class',
    'PA': 'Pharmacological Action',
    'has role': 'Application',
    'Chemical/Ingredient': 'Chemical/Ingredient',
}

def get_class_url(class_source, class_id):
    """Create URLs for pharmacological classes based on their source.

    Parameters
    ----------
    class_source : str
        Name of the vocabulary the class comes from. 'CHEBI', 'MeSH' and
        'FDA' are recognised.
    class_id : str
        Identifier of the class within that vocabulary. It is percent-encoded
        before being placed in the URL (so ':' becomes '%3A').

    Returns
    -------
    str or None
        The URL for the class, or None when `class_source` is not recognised.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    url_templates = {
        'CHEBI': 'http://identifiers.org/chebi/{}',
        'MeSH': 'http://identifiers.org/mesh/{}',
        # Use bioportal link until something better arises. The alternative
        # considered was the RxNav REST lookup:
        # https://rxnav.nlm.nih.gov/REST/Ndfrt/id?idType=NUI&idString={}
        'FDA': 'http://purl.bioontology.org/ontology/NDFRT/{}',
    }
    class_id = quote(class_id)
    template = url_templates.get(class_source)
    if template is None:
        return None
    return template.format(class_id)

# Replace the TYPE code with its readable name, order the catalogue by class
# type and identifier, and switch to lower-case column names.
classes_df = (
    classes_df
    .assign(class_type=classes_df['TYPE'].map(class_type_map))
    .drop('TYPE', axis='columns')
    .sort_values(['class_type', 'CLASS_SOURCE_ID'])
    .rename(columns={'CLASS_SOURCE_ID': 'class_id', 'CLASS': 'class_name', 'SOURCE': 'class_source'})
)

# Build one URL per class from its source vocabulary and identifier.
classes_df['url'] = classes_df.apply(
    lambda row: get_class_url(row['class_source'], row['class_id']),
    axis=1,
)
classes_df.head(2)

Unnamed: 0,class_id,class_name,class_source,class_type,url
73,CHEBI:21241,vitamin C,CHEBI,Application,http://identifiers.org/chebi/CHEBI%3A21241
4385,CHEBI:22153,acaricide,CHEBI,Application,http://identifiers.org/chebi/CHEBI%3A22153


In [21]:
# Write the drug-to-class memberships and the class catalogue as
# tab-separated text, omitting the DataFrame index.
class_df.to_csv('rephetio/drug-to-class.tsv', sep='\t', index=False)
classes_df.to_csv('rephetio/classes.tsv', sep='\t', index=False)