library(dplyr)
library(DT)

options(stringsAsFactors = FALSE)

Read input for compound mapping

# read the file of approved, SM, structured drugbank compounds
drugbank.df <- 'http://git.dhimmel.com/drugbank/data/drugbank-slim.tsv' %>%
  read.delim(stringsAsFactors=FALSE, na.strings='') %>%
  dplyr::transmute(drugbank_id, drugbank_name = name)

# construct a rxnorm to drugbank mapping (through FDA-SRS UNII)
rxnorm.df <- dplyr::inner_join(
  file.path('data', 'rxcui-unii-map.tsv') %>%
    read.delim(),
  'http://git.dhimmel.com/drugbank/data/mapping/fdasrs.tsv' %>%
    read.delim()
)
## Joining by: "fdasrs_id"

Read input for disease mapping

# read the file with DO slim terms
do.df <- 'http://git.dhimmel.com/disease-ontology/data/slim-terms.tsv' %>%
  read.delim() %>%
  dplyr::transmute(doid_code = doid, doid_name = name)

# read disease ontology slim term propagations
doslim.df <- 'http://git.dhimmel.com/disease-ontology/data/slim-terms-prop.tsv' %>%
  read.delim()

# read disease ontology slim mappings
domap.df <- 
  'http://git.dhimmel.com/disease-ontology/data/xrefs-prop-slim.tsv' %>%
   read.delim()

# extract the DO to UMLS mapping
umls.df <- domap.df %>%
  dplyr::filter(resource == 'UMLS') %>%
  dplyr::select(-resource) %>%
  dplyr::rename(disease_cui = resource_id)

# extract the DO to ICD9 mapping
icd9.df <- domap.df %>%
  dplyr::filter(resource == 'ICD9') %>%
  dplyr::select(-resource) %>%
  dplyr::rename(disease_icd9 = resource_id)

# extract the DO to OMIM mapping
omim.df <- domap.df %>%
  dplyr::filter(resource == 'OMIM') %>%
  dplyr::select(-resource) %>%
  dplyr::rename(disease_omim = resource_id) %>%
  dplyr::mutate(disease_omim = as.integer(disease_omim))

Read and map labeledin

labin.df <-
  # read labeledin data
  file.path('labeledin', 'data', 'indications.tsv') %>%
  read.delim() %>%
  # remove combo drugs
  dplyr::mutate(rxnorm_id = as.integer(rxnorm_id)) %>%
  dplyr::filter(! is.na(rxnorm_id)) %>%
  # map umls diseases to DO
  dplyr::inner_join(umls.df) %>%
  # map rxnorm compounds to drugbank
  dplyr::inner_join(rxnorm.df)
## Warning in mutate_impl(.data, dots): NAs introduced by coercion
## Joining by: "disease_cui"
## Joining by: "rxnorm_id"

Read and map MEDI

medi.df <- 
  file.path('medi', 'data', 'medi-umls.tsv') %>%
  read.delim() %>%
  dplyr::inner_join(rxnorm.df)
## Joining by: "rxnorm_id"
medi.df <- dplyr::bind_rows(
  umls.df %>%
    dplyr::inner_join(medi.df),
  icd9.df %>%
    dplyr::inner_join(medi.df)
)
## Joining by: "disease_cui"
## Joining by: "disease_icd9"

Read and map PREDICT

predict.df <- 
  file.path('msb-predict', 'data', 'indications-umls.tsv') %>%
  read.delim() %>%
  dplyr::rename(disease_cui = umls_cui, disease_omim = omim_id)

predict.df <- dplyr::bind_rows(
  umls.df %>%
    dplyr::inner_join(predict.df),
  omim.df %>%
    dplyr::inner_join(predict.df)
)
## Joining by: "disease_cui"
## Joining by: "disease_omim"

Join resources

indication.df <- dplyr::bind_rows(
  # LabeledIn
  labin.df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'labeledin'),
  # MEDI
  medi.df %>%
    dplyr::group_by(doid_code, drugbank_id) %>%
    dplyr::summarize(
      resource = ifelse(max(hps), 'medi_hps', 'medi_lps')
    ) %>%
    dplyr::ungroup(),
  # PREDICT
  predict.df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'predict'),
  # ehrlink
  ehrlink.df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'ehrlink')
)

# add compound and disease names
indication.df <- indication.df %>%
  dplyr::inner_join(drugbank.df) %>%
  dplyr::left_join(do.df)
## Joining by: "drugbank_id"
## Joining by: "doid_code"
# save sourced indications
path <- file.path('data', 'indications-with-source.tsv')
write.table(indication.df, path, sep='\t', row.names=FALSE, quote=FALSE)

indication.df %>% DT::datatable()

Convert to a single row per compound-disease pair

hc.sources <- c('medi_hps', 'ehrlink', 'labeledin', 'predict')
lc.sources <- c('medi_lps')

indication_no_source.df <- indication.df %>%
  dplyr::group_by(doid_code, drugbank_id, doid_name, drugbank_name) %>%
  dplyr::summarize(
    n_hc_resources = length(intersect(resource, hc.sources)),
    n_lc_resources = length(intersect(resource, lc.sources))
    ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(confidence = ifelse(n_hc_resources > 0, 'high', 'low'))

path <- file.path('data', 'indications.tsv')
write.table(indication_no_source.df, path, sep='\t', row.names=FALSE, quote=FALSE)

indication_no_source.df %>% DT::datatable()

Indications per disease

indication_no_source.df %>%
  dplyr::group_by(doid_code, doid_name) %>%
  dplyr::summarize(
    'n_hc' = sum(confidence == 'high'),
    'n_lc' = sum(confidence == 'low')) %>%
  DT::datatable()

Indications per compound

indication_no_source.df %>%
  dplyr::group_by(drugbank_id, drugbank_name) %>%
  dplyr::summarize(
    'n_hc' = sum(confidence == 'high'),
    'n_lc' = sum(confidence == 'low')) %>%
  DT::datatable()