In [1]:
import os
import csv
import re

import networkx
import pandas

import do_tools

In [2]:
# Check out the Disease Ontology trunk from its SourceForge SVN repository into ./download.
# The recorded output of this cell shows download/HumanDO.obo being updated at revision 2816.
! svn checkout svn://svn.code.sf.net/p/diseaseontology/code/trunk/ download

U    download/HumanDO.obo
Checked out revision 2816.


In [2]:
# Location of the OBO file inside the SVN working copy created by the checkout cell.
path = os.path.join('download', 'HumanDO.obo')
# Load the ontology through the do_tools helper module (presumably project-local;
# the type it returns is not visible from this notebook).
do = do_tools.load_do(path)
# Graph form used by the rest of the notebook: later cells iterate `dox` to get term
# objects (reading .id, .name, .definition, .xrefs, .synonyms) and pass it to
# networkx graph functions.
dox = do_tools.do_to_networkx(do)

In [3]:
# Build a (disease_id, name, description) table with one row per term in `dox`
# and save it as data/description.tsv.
# The pattern captures the text between a double quote at the very start of the
# definition string and the next double quote.
pattern = re.compile(r'^"(.*?)"')
rows = []
for term in dox:
    match = pattern.search(term.definition)
    if match:
        description = match.group(1)
    else:
        # Definitions that do not start with a quoted passage get an empty description.
        description = ''
    rows.append((term.id, term.name, description))
description_df = pandas.DataFrame(rows, columns=['disease_id', 'name', 'description'])
description_df = description_df.sort_values('disease_id')
description_df.to_csv('data/description.tsv', sep='\t', index=False)
description_df.head(2)

Unnamed: 0,disease_id,name,description
1509,DOID:0001816,angiosarcoma,A malignant vascular tumor that results_in rap...
3043,DOID:0002116,pterygium,


In [9]:
# Resource prefixes that are rewritten to a shorter label when xref rows are written
# (passed as `rename_dict` to write_xref_row). Both SNOMEDCT releases map to one label.
xref_rename = dict(
    ICD10CM='ICD10',
    ICD9CM='ICD9',
    NCI2009_04D='NCI',
    SNOMEDCT_2010_1_31='SNOMEDCT',
    SNOMEDCT_2013_01_31='SNOMEDCT',
    UMLS_CUI='UMLS',
)

In [10]:
def write_xref_row(writer, doid_code, doid_name, xrefs, rename_dict):
    """Write one row per cross-reference of a single term, in sorted order.

    Parameters
    ----------
    writer : object with a ``writerows`` method (e.g. a ``csv.writer``)
    doid_code, doid_name : values placed in the first two columns of every row
    xrefs : iterable of ``'RESOURCE:identifier'`` strings
    rename_dict : mapping from a resource prefix to its replacement label;
        prefixes not present in the mapping are written unchanged

    Each xref is split on its first ``':'`` only, so identifiers may themselves
    contain colons. An xref without any ``':'`` raises ``ValueError``.
    """
    records = []
    for xref in xrefs:
        prefix, identifier = xref.split(':', 1)
        label = rename_dict.get(prefix, prefix)
        records.append([doid_code, doid_name, label, identifier])
    # Sort after renaming so rows are ordered by the label actually written.
    writer.writerows(sorted(records))

# Write two cross-reference tables:
#   data/xrefs.tsv      - each term with its own xrefs
#   data/xrefs-prop.tsv - each term with its own xrefs plus those of every node
#                         returned by networkx.ancestors(dox, term)
# The with-block closes both files even if the loop raises, and newline='' is the
# mode the csv module expects, so the writer controls line endings itself and no
# '\r\r\n' sequences appear on Windows.
with open(os.path.join('data', 'xrefs.tsv'), 'w', newline='') as file_unprop, \
        open(os.path.join('data', 'xrefs-prop.tsv'), 'w', newline='') as file_prop:
    writer_unprop = csv.writer(file_unprop, delimiter='\t')
    writer_prop = csv.writer(file_prop, delimiter='\t')

    # Same header for both tables.
    for writer in writer_unprop, writer_prop:
        writer.writerow(['doid_code', 'doid_name', 'resource', 'resource_id'])

    # NOTE(review): topological_sort_recursive(..., reverse=True) appears to be a
    # networkx 1.x API that is no longer available in networkx >= 2.0 - confirm the
    # pinned version. The traversal only determines the order of terms in the output.
    for term in networkx.topological_sort_recursive(dox, reverse=True):
        xrefs = set(term.xrefs)
        # Start from a copy so the term's own xref set is not modified.
        xrefs_prop = set(xrefs)
        for ancestor in networkx.ancestors(dox, term):
            xrefs_prop |= set(ancestor.xrefs)

        write_xref_row(writer_unprop, term.id, term.name, xrefs, xref_rename)
        write_xref_row(writer_prop, term.id, term.name, xrefs_prop, xref_rename)


In [11]:
# List the distinct resource labels present in the unpropagated xref table.
# pandas is already imported in the notebook's first cell, so it is not re-imported here.
path = os.path.join('data', 'xrefs.tsv')
xref_df = pandas.read_table(path)
# Bracket lookup always selects the column; a set is displayed as the cell result.
set(xref_df['resource'])

{'CSP',
 'CTV3',
 'EFO',
 'EFOpat_id',
 'HP',
 'ICD10',
 'ICD9',
 'KEGG',
 'MEDDRA',
 'MSH',
 'MTH',
 'NCI',
 'NDFRT',
 'OMIM',
 'ORDO',
 'Orphanet',
 'SNOMEDCT',
 'UMLS',
 'WHO'}

In [12]:
# create a name to term mapping
rows = list()
for term in dox:
    rows.append({'doid': term.id, 'name': term.name, 'type': 'name'})
    for synonym in term.synonyms:
        rows.append({'doid': term.id, 'name': synonym[0], 'type': '{}-synonym'.format(synonym[1].lower())})
path = os.path.join('data', 'term-names.tsv')
with open(path, 'w') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=['doid', 'name', 'type'])
    writer.writeheader()
    writer.writerows(rows)