## Parsing rare list

https://globalgenes.org/rarelist


In [3]:
# Fetch HTML using requests lib and feed to bs4
import requests

# NOTE(security): verify=False disables TLS certificate verification for this
# request, so the response cannot be authenticated. Be careful! Kept because the
# notebook deliberately opted in; remove it once the site's certificate validates.
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
result = requests.get("https://globalgenes.org/rarelist", verify=False, timeout=60)
# fail fast on an HTTP error status instead of silently parsing an error page
result.raise_for_status()

from bs4 import BeautifulSoup
from bs4 import NavigableString
# parse the raw response bytes with the stdlib-backed 'html.parser'
soup = BeautifulSoup(result.content, 'html.parser')





In [4]:
# check: display the parsed document's <title> element to confirm the
# expected page was fetched and parsed
soup.title

<title>Rare Disease List</title>

In [5]:
# write formatted html to file
# (not used: this is just a useful side effect for exploration)
# `with` closes the file even if prettify()/write() raises; the explicit UTF-8
# encoding keeps non-ASCII characters in the page from depending on the
# platform's default encoding
with open('rarelist.html', 'w', encoding='utf-8') as f:
    f.write(soup.prettify())

In [6]:
# use bs4 to extract names from HTML
#
# For every <h5> heading, walk the <li> items of the next sibling <ul>.
# The first child of each <li> is either the name text itself or a tag
# wrapping the name; in the wrapped case the first <a> inside the <li>
# (if any) supplies the URL recorded for that name.

names = []  ## all disease names found
name2url = {}  ## mapping of names to URLs

h5s = soup.find_all("h5")
for h5 in h5s:
    ul = h5.find_next_sibling('ul')
    if ul is None:
        # heading with no following <ul> sibling: nothing to extract
        continue
    for li in ul.find_all('li'):
        if len(li.contents) == 0:
            continue
        n = li.contents[0]
        if not isinstance(n, NavigableString):
            # first child is a tag: its own first child is taken as the name
            if len(n.contents) == 0:
                # empty wrapper tag, no name to take; report and skip
                print('BAD: {}'.format(li))
                continue
            n = n.contents[0]
            anchor = li.find('a')
            # only record a URL when the anchor actually carries an href
            if anchor is not None and anchor.get('href') is not None:
                name2url[n] = anchor['href']
        names.append(n)

# show the first 20 for sanity checking
names[0:20]

['Aagenaes syndrome',
 'Aarskog syndrome',
 'Aase Smith syndrome',
 'ABCD syndrome',
 'Abderhalden Kaufmann Lignac syndrome',
 'Abdominal aortic aneurysm',
 'Abdominal chemodectomas with cutaneous angiolipomas',
 'Abdominal cystic lymphangioma',
 'Abdominal obesity metabolic syndrome',
 'Aberrant subclavian artery',
 'Abetalipoproteinemia',
 'Abidi X-linked mental retardation syndrome',
 'Ablepharon macrostomia syndrome',
 "Abrikosov's tumor",
 'Abruzzo Erickson syndrome',
 'Absence of fingerprints congenital milia',
 'Absence of gluteal muscle',
 'Absence of septum pellucidum',
 'Absence of Tibia',
 'Absence of tibia with polydactyly']

In [7]:
## sanity check URL mapping: show the first 10 (name, URL) pairs
## in the dict's insertion order
list(name2url.items())[0:10]

[('Acute disseminated encephalomyelitis', 'http://ulf.org/'),
 ('Acute hemorrhagic leukoencephalitis', 'http://ulf.org/'),
 ('Adrenoleukodystrophy X-linked', 'http://ulf.org/'),
 ('Adrenomyeloneuropathy', 'http://ulf.org/'),
 ('Aicardi-Goutieres syndrome', 'http://ulf.org/'),
 ('Alexander disease', 'http://ulf.org/'),
 ('Alkaptonuria', 'http://www.alkaptonuria.info/'),
 ('Alpers syndrome',
  'http://www.umdf.org/site/c.8qKOJ0MvF7LUG/b.7929671/k.BDF0/Home.htm'),
 ('Alzheimer disease familial', 'http://www.mitoaction.org/'),
 ('Alzheimer disease type 1', 'http://www.mitoaction.org/')]

In [8]:
import csv

# Dump one tab-separated row per scraped name: the name, then its URL.
# Names without a recorded URL get None from dict.get, which the csv
# writer emits as an empty field.
with open('rare-list.tsv', 'w', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    tsv_writer.writerows(
        [disease_name, name2url.get(disease_name)] for disease_name in names
    )

In [9]:
## use ontobio lib for fetching ontologies and lexical mapping
from ontobio import OntologyFactory



In [10]:
# factory used by the following cells to create ontology objects
# from handles such as 'obo:hp'
ofa = OntologyFactory()

In [11]:
# create the ontology for handle 'obo:hp'
# (presumably the Human Phenotype Ontology, fetched remotely — TODO confirm)
hp = ofa.create('obo:hp')

In [12]:
# create the ontology for handle 'obo:mondo'; its node IDs appear as
# MONDO:... in the mapping table further down
mondo = ofa.create('obo:mondo')

In [13]:
from ontobio.lexmap import LexicalMapEngine
# engine used below to index the ontologies and compute the mapping graph
# between them (see index_ontology / get_xref_graph calls)
lexmap = LexicalMapEngine()


In [14]:
# Quick hack to make a degenerate 'ontology' from the list of names
from ontobio import Ontology

def ont_from_names(names, ont_id='rare'):
    """Build a degenerate 'ontology' containing one node per name.

    Only nodes are added; no edges are created. Each name is passed as both
    arguments of ``add_node``, i.e. it serves as the node ID and (presumably)
    as the node label.

    Args:
        names: iterable of name strings.
        ont_id: ID given to the created ontology. Defaults to 'rare', the
            value that was previously hard-coded.

    Returns:
        The populated ``Ontology`` instance.
    """
    ont = Ontology(id=ont_id)
    for n in names:
        ## use name as ID
        ont.add_node(n, n)
    return ont
        
# build the name-only ontology from the scraped names; the bare `rare`
# on the last line displays the object's repr as the cell output
rare = ont_from_names(names)
rare

rare handle: None meta: None

In [15]:
## quick inspection: first 10 nodes of the name-derived ontology
## (node IDs are the disease names themselves, see ont_from_names)
rare.nodes()[0:10]

['Aagenaes syndrome',
 'Aarskog syndrome',
 'Aase Smith syndrome',
 'ABCD syndrome',
 'Abderhalden Kaufmann Lignac syndrome',
 'Abdominal aortic aneurysm',
 'Abdominal chemodectomas with cutaneous angiolipomas',
 'Abdominal cystic lymphangioma',
 'Abdominal obesity metabolic syndrome',
 'Aberrant subclavian artery']

In [16]:
## index the 3 ontologies, in order: hp, mondo, rare
for ontology in (hp, mondo, rare):
    lexmap.index_ontology(ontology)



In [17]:
## CONFIGURE
## we will map R to mondo and hp separately:
## one (rare, target) ID pair per target ontology, mondo first
lexmap.ontology_pairs = [(rare.id, target.id) for target in (mondo, hp)]

In [18]:
# align: compute the cross-reference (mapping) graph from the indexed ontologies
# (presumably limited to lexmap.ontology_pairs configured above — TODO confirm)
g = lexmap.get_xref_graph()

In [19]:
# get a dataframe from the mapping graph; the bare `df` displays it
# as the cell output
df=lexmap.as_dataframe(g)
df

Unnamed: 0,left,left_label,right,right_label,left_match_type,right_match_type,left_match_val,right_match_val,score,left_simscore,...,conditional_pr_equiv,pr_subClassOf,pr_superClassOf,pr_equivalentTo,pr_other,left_novel,right_novel,left_consistent,right_consistent,equiv_clique_size
3287,11-beta-hydroxylase deficiency,11-beta-hydroxylase deficiency,MONDO:0008729,congenital adrenal hyperplasia due to 11-beta-...,label,hasRelatedSynonym,11-beta-hydroxylase deficiency,11-Beta-Hydroxylase Deficiency,50.0,1.000000,...,1.000000,0.061581,0.061581,0.799654,0.077184,True,True,False,False,7
2199,15q13.3 microdeletion syndrome,15q13.3 microdeletion syndrome,MONDO:0012774,chromosome 15q13.3 microdeletion syndrome,label,hasExactSynonym,15q13.3 microdeletion syndrome,15q13.3 microdeletion syndrome,90.0,1.000000,...,1.000000,0.029969,0.029969,0.918763,0.021299,True,True,False,False,6
3339,17-alpha-hydroxylase deficiency,17-alpha-hydroxylase deficiency,MONDO:0008730,congenital adrenal hyperplasia due to 17-alpha...,label,hasRelatedSynonym,17-alpha-hydroxylase deficiency,17-Alpha-Hydroxylase Deficiency,50.0,1.000000,...,1.000000,0.061581,0.061581,0.799654,0.077184,True,True,False,False,5
3481,17-beta hydroxysteroid dehydrogenase 3 deficiency,17-beta hydroxysteroid dehydrogenase 3 deficiency,MONDO:0009916,"46,XY disorder of sex development due to 17-be...",label,hasExactSynonym,17-beta hydroxysteroid dehydrogenase 3 deficiency,17-beta-hydroxysteroid dehydrogenase 3 deficiency,58.0,1.000000,...,1.000000,0.205965,0.205965,0.392394,0.195675,True,True,False,False,7
2592,17q21.31 microdeletion syndrome,17q21.31 microdeletion syndrome,MONDO:0012496,Koolen de Vries syndrome,label,hasExactSynonym,17q21.31 microdeletion syndrome,17q21.31 microdeletion syndrome,90.0,1.000000,...,0.473684,0.168017,0.055554,0.749591,0.026839,True,True,False,False,8
2593,17q21.31 microdeletion syndrome,17q21.31 microdeletion syndrome,MONDO:0018216,17q21.31 microdeletion syndrome,label,label,17q21.31 microdeletion syndrome,17q21.31 microdeletion syndrome,100.0,1.000000,...,0.526316,0.051671,0.108232,0.824734,0.015363,True,True,False,False,8
2987,18 Hydroxylase deficiency,18 Hydroxylase deficiency,MONDO:0008751,Corticosterone methyloxidase type 1 deficiency,label,hasRelatedSynonym,18 Hydroxylase deficiency,18-Hydroxylase Deficiency,32.0,1.000000,...,0.355556,0.232996,0.289482,0.283582,0.193941,True,True,False,False,6
2986,18 Hydroxylase deficiency,18 Hydroxylase deficiency,MONDO:0020489,familial hyperreninemic hypoaldosteronism type 1,label,hasExactSynonym,18 Hydroxylase deficiency,18-hydroxylase deficiency,58.0,1.000000,...,0.644444,0.292046,0.210145,0.309167,0.188643,True,True,False,False,6
1960,1q21.1 microdeletion syndrome,1q21.1 microdeletion syndrome,MONDO:0012914,chromosome 1q21.1 deletion syndrome,label,hasExactSynonym,1q21.1 microdeletion syndrome,1q21.1 microdeletion syndrome,90.0,1.000000,...,1.000000,0.030109,0.030109,0.923042,0.016740,True,True,False,False,6
1428,2 4-Dienoyl-CoA reductase deficiency,2 4-Dienoyl-CoA reductase deficiency,MONDO:0014464,progressive encephalopathy with leukodystrophy...,label,hasExactSynonym,2 4-Dienoyl-CoA reductase deficiency,"2,4-dienoyl-CoA reductase deficiency",58.0,1.000000,...,1.000000,0.200803,0.200803,0.382559,0.215835,True,True,False,False,5


In [20]:
## write to file (not used here but can be examined separately)
## tab-separated, with the dataframe index column omitted
df.to_csv('rare-matches.tsv', sep="\t", index=False)

In [21]:
# dataframe of entries left unmapped in graph g
# (presumably drawn from all indexed ontologies — see the TODO in the next cell)
udf = lexmap.unmapped_dataframe(g)

In [22]:
## unmapped (TODO this includes unmapped from MONDO/HP to R, which we don't care about so much)
## written tab-separated without the index column; the bare `udf` then
## displays the dataframe as the cell output
udf.to_csv('rare-no-matches.tsv', sep="\t", index=False)
udf

Unnamed: 0,id,label,mapped_equivs
18057,16p11.2 deletion syndrome,16p11.2 deletion syndrome,
105646,2-Methylacetoacetyl CoA thiolase deficiency,2-Methylacetoacetyl CoA thiolase deficiency,
41905,2-hydroxyethyl methacrylate sensitization,2-hydroxyethyl methacrylate sensitization,
29133,22q11.2 duplication syndrome,22q11.2 duplication syndrome,
100428,22q13.3 deletion syndrome,22q13.3 deletion syndrome,
96122,2q37 deletion syndrome,2q37 deletion syndrome,
88482,3 Methylcrotonyl-CoA carboxylase 1 deficiency,3 Methylcrotonyl-CoA carboxylase 1 deficiency,
34501,3 alpha methylcrotonyl-CoA carboxylase 2 defic...,3 alpha methylcrotonyl-CoA carboxylase 2 defic...,
85670,3-alpha hydroxyacyl-CoA dehydrogenase deficiency,3-alpha hydroxyacyl-CoA dehydrogenase deficiency,
77929,3p deletion syndrome,3p deletion syndrome,
