{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MeSH 2015 descriptors\n",
    "\n",
    "Parses the MeSH descriptor XML release (`download/desc2015.gz`) into term tables, a JSON term list, and a directed ontology graph written to `data/`, then identifies disease, symptom, and side-effect descriptors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import gzip\n",
    "import io\n",
    "import json\n",
    "\n",
    "import xml.etree.ElementTree as ET\n",
    "\n",
    "import networkx\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#!wget --timestamping --directory-prefix download/ ftp://nlmpubs.nlm.nih.gov/online/mesh/.xmlmesh/desc2015.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read MeSH xml release\n",
    "xml_path = os.path.join('download', 'desc2015.gz')\n",
    "with gzip.open(xml_path) as xml_file:\n",
    "    tree = ET.parse(xml_file)\n",
    "root = tree.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Extract mesh terms\n",
    "term_dicts = list()\n",
    "for descriptor in root:\n",
    "    for concept in descriptor.findall('ConceptList/Concept'):\n",
    "        for term in concept.findall('TermList/Term'):\n",
    "            term_dict = {\n",
    "                'DescriptorUI': descriptor.findtext('DescriptorUI'),\n",
    "                'ConceptUI': concept.findtext('ConceptUI'),\n",
    "                'TermUI': term.findtext('TermUI'),\n",
    "                'TermName': term.findtext('String')\n",
    "            }\n",
    "            term_dict.update(concept.attrib)\n",
    "            term_dict.update(term.attrib)\n",
    "            term_dicts.append(term_dict)\n",
    "\n",
    "columns = ['DescriptorUI', 'ConceptUI', 'PreferredConceptYN', 'TermUI', 'TermName',\n",
    "           'ConceptPreferredTermYN', 'IsPermutedTermYN', 'LexicalTag', 'PrintFlagYN', 'RecordPreferredTermYN']\n",
    "term_df = pandas.DataFrame(term_dicts)[columns]\n",
    "term_df.to_csv('data/descriptor-terms.tsv', index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Test whether MeSH term names are unique\n",
    "len(term_df) == len(set(term_df.TermName))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Parse MeSH xml release\n",
    "terms = list()\n",
    "\n",
    "for elem in root:\n",
    "    term = dict()\n",
    "    term['mesh_id'] = elem.findtext('DescriptorUI')\n",
    "    term['mesh_name'] = elem.findtext('DescriptorName/String')\n",
    "    # Deduplicate, then sort so that mesh.json is identical across runs\n",
    "    term['semantic_types'] = sorted({x.text for x in elem.findall(\n",
    "        'ConceptList/Concept/SemanticTypeList/SemanticType/SemanticTypeUI')})\n",
    "    term['tree_numbers'] = [x.text for x in elem.findall('TreeNumberList/TreeNumber')]\n",
    "    terms.append(term)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Determine ontology parents\n",
    "tree_number_to_id = {tn: term['mesh_id'] for term in terms for tn in term['tree_numbers']}\n",
    "\n",
    "for term in terms:\n",
    "    parents = set()\n",
    "    for tree_number in term['tree_numbers']:\n",
    "        # The parent tree number is everything before the last '.';\n",
    "        # a tree number without a '.' is top-level and has no parent.\n",
    "        parent_tn, dot, _ = tree_number.rpartition('.')\n",
    "        if dot:\n",
    "            # A parent tree number missing from the release raises KeyError\n",
    "            parents.add(tree_number_to_id[parent_tn])\n",
    "    # Sort so that mesh.json is identical across runs\n",
    "    term['parents'] = sorted(parents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "path = os.path.join('data', 'mesh.json')\n",
    "with open(path, 'w') as write_file:\n",
    "    json.dump(terms, write_file, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create a networkx directed graph representing MeSH\n",
    "network = networkx.DiGraph()\n",
    "\n",
    "# add nodes\n",
    "for term in terms:\n",
    "    network.add_node(term['mesh_id'], name=term['mesh_name'])\n",
    "\n",
    "# add edges (parent -> child)\n",
    "for term in terms:\n",
    "    for parent in term['parents']:\n",
    "        network.add_edge(parent, term['mesh_id'])\n",
    "\n",
    "assert networkx.is_directed_acyclic_graph(network)\n",
    "\n",
    "networkx.write_gexf(network, 'data/ontology.gexf.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read mesh\n",
    "path = os.path.join('data', 'mesh.json')\n",
    "with open(path) as read_file:\n",
    "    mesh = json.load(read_file)\n",
    "\n",
    "mesh_df = pandas.DataFrame.from_dict(mesh)[['mesh_id', 'mesh_name']]\n",
    "mesh_df.to_csv('data/terms.tsv', sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Extract (mesh_id, mesh_tree_number) pairs\n",
    "rows = []\n",
    "for term in mesh:\n",
    "    mesh_id = term['mesh_id']\n",
    "    mesh_name = term['mesh_name']\n",
    "    for tree_number in term['tree_numbers']:\n",
    "        rows.append([mesh_id, mesh_name, tree_number])\n",
    "\n",
    "tn_df = pandas.DataFrame(rows, columns=['mesh_id', 'mesh_name', 'mesh_tree_number'])\n",
    "tn_df.to_csv('data/tree-numbers.tsv', sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Diseases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Tree-number prefixes treated as human disease branches:\n",
    "# F03 (mental disorders), C01 through C21, and C24 through C26.\n",
    "# C23 is handled inside is_human_disease because C23.888 is excluded.\n",
    "HUMAN_DISEASE_PREFIXES = tuple(\n",
    "    ['F03'] + ['C{:02d}'.format(i) for i in list(range(1, 22)) + [24, 25, 26]])\n",
    "\n",
    "\n",
    "def is_human_disease(tn):\n",
    "    \"\"\"Given a tree number, return whether the hierarchical path suggests a human disease.\"\"\"\n",
    "    # str.startswith accepts a tuple and matches any of its prefixes\n",
    "    if tn.startswith(HUMAN_DISEASE_PREFIXES):\n",
    "        return True\n",
    "    # C23 excluding C23.888 (Symptoms and Signs)\n",
    "    if tn.startswith('C23') and not tn.startswith('C23.888'):\n",
    "        return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Descriptors with at least one tree number under a human disease branch\n",
    "diseases = {term['mesh_id'] for term in terms if any(map(is_human_disease, term['tree_numbers']))}\n",
    "len(diseases)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Symptoms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read HSDN symptoms\n",
    "url = 'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Symptom-Occurence-Output.tsv'\n",
    "hsdn_symptom_df = pandas.read_table(url, index_col=0)\n",
    "hsdn_symptoms = hsdn_symptom_df['MeSH Symptom ID']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "319"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find MeSH symptoms\n",
    "symptoms = networkx.descendants(network, 'D012816')  # signs and symptoms\n",
    "# Copy the filtered rows so the new column is added to an independent frame,\n",
    "# not to a slice of mesh_df\n",
    "symptom_df = mesh_df[mesh_df.mesh_id.isin(symptoms)].copy()\n",
    "symptom_df['in_hsdn'] = symptom_df.mesh_id.isin(hsdn_symptoms).astype(int)\n",
    "symptom_df.to_csv('data/symptoms.tsv', index=False, sep='\\t')\n",
    "sum(symptom_df.in_hsdn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Side Effects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "side_effects = networkx.descendants(network, 'D064420')  # Drug-Related Side Effects and Adverse Reactions\n",
    "side_effect_df = mesh_df[mesh_df.mesh_id.isin(side_effects)]\n",
    "len(side_effect_df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}