{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MeSH term and ontology extraction\n",
    "\n",
    "Parses the MeSH descriptor XML release (`download/desc2015.gz`) and writes:\n",
    "\n",
    "- `data/mesh.json` and `data/terms.tsv`: the parsed terms\n",
    "- `data/ontology.gexf.gz`: the parent-to-child ontology as a directed graph\n",
    "- `data/symptoms.tsv`: descendants of D012816 (signs and symptoms), flagged by presence in HSDN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import gzip\n",
    "import io\n",
    "import json\n",
    "\n",
    "import xml.etree.ElementTree as ET\n",
    "\n",
    "import networkx\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read MeSH xml release\n",
    "# ftp://nlmpubs.nlm.nih.gov/online/mesh/.xmlmesh/desc2015.gz\n",
    "xml_path = os.path.join('download', 'desc2015.gz')\n",
    "with gzip.open(xml_path) as xml_file:\n",
    "    tree = ET.parse(xml_file)\n",
    "root = tree.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "27455"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parse MeSH xml release into one dict per child element of the root\n",
    "terms = list()\n",
    "\n",
    "for elem in root:\n",
    "    term = dict()\n",
    "    term['mesh_id'] = elem.findtext('DescriptorUI')\n",
    "    term['mesh_name'] = elem.findtext('DescriptorName/String')\n",
    "    # Deduplicate across concepts; sort so the exported JSON is reproducible\n",
    "    term['semantic_types'] = sorted({x.text for x in elem.findall(\n",
    "        'ConceptList/Concept/SemanticTypeList/SemanticType/SemanticTypeUI')})\n",
    "    term['tree_numbers'] = [x.text for x in elem.findall('TreeNumberList/TreeNumber')]\n",
    "    terms.append(term)\n",
    "\n",
    "len(terms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Determine ontology parents\n",
    "tree_number_to_id = {tn: term['mesh_id'] for term in terms for tn in term['tree_numbers']}\n",
    "\n",
    "for term in terms:\n",
    "    parents = set()\n",
    "    for tree_number in term['tree_numbers']:\n",
    "        # The parent tree number is everything before the last '.'\n",
    "        parent_tn, separator, _ = tree_number.rpartition('.')\n",
    "        if not separator:\n",
    "            # No '.' in the tree number, so there is no parent to look up\n",
    "            continue\n",
    "        # A parent tree number missing from the release raises KeyError\n",
    "        parents.add(tree_number_to_id[parent_tn])\n",
    "    # Sorted so that the exported JSON is reproducible between runs\n",
    "    term['parents'] = sorted(parents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Export the parsed terms (including parents) as JSON\n",
    "os.makedirs('data', exist_ok=True)\n",
    "path = os.path.join('data', 'mesh.json')\n",
    "with open(path, 'w') as write_file:\n",
    "    json.dump(terms, write_file, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create a networkx directed graph representing MeSH\n",
    "network = networkx.DiGraph()\n",
    "\n",
    "# add nodes\n",
    "for term in terms:\n",
    "    network.add_node(term['mesh_id'], name=term['mesh_name'])\n",
    "\n",
    "# add edges, directed from parent to child\n",
    "for term in terms:\n",
    "    for parent in term['parents']:\n",
    "        network.add_edge(parent, term['mesh_id'])\n",
    "\n",
    "# Stop here if the parent links contain a cycle\n",
    "assert networkx.is_directed_acyclic_graph(network)\n",
    "\n",
    "os.makedirs('data', exist_ok=True)\n",
    "networkx.write_gexf(network, os.path.join('data', 'ontology.gexf.gz'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read UMLS semantic types\n",
    "# NOTE: sty_df is not referenced by any later cell in this notebook\n",
    "url = 'http://semanticnetwork.nlm.nih.gov/Download/RelationalFiles/SRDEF'\n",
    "sty_df = pandas.read_table(url, sep='|', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read mesh back from the exported JSON\n",
    "path = os.path.join('data', 'mesh.json')\n",
    "with open(path) as read_file:\n",
    "    mesh = json.load(read_file)\n",
    "\n",
    "# Keep only the identifier and name columns for the term table\n",
    "mesh_df = pandas.DataFrame.from_dict(mesh)[['mesh_id', 'mesh_name']]\n",
    "mesh_df.to_csv(os.path.join('data', 'terms.tsv'), sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Symptoms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the HSDN symptom table and keep its MeSH symptom identifiers\n",
    "url = 'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Symptom-Occurence-Output.tsv'\n",
    "hsdn_symptom_df = pandas.read_table(url, index_col=0)\n",
    "hsdn_symptoms = hsdn_symptom_df['MeSH Symptom ID']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "symptoms = networkx.descendants(network, 'D012816') # signs and symptoms\n",
    "# Take an explicit copy so that adding a column does not write into a slice of mesh_df\n",
    "symptom_df = mesh_df[mesh_df.mesh_id.isin(symptoms)].copy()\n",
    "# 1 when the term also appears in HSDN, otherwise 0\n",
    "symptom_df['in_hsdn'] = symptom_df.mesh_id.isin(hsdn_symptoms).astype(int)\n",
    "symptom_df.to_csv(os.path.join('data', 'symptoms.tsv'), index=False, sep='\\t')\n",
    "# Number of symptom terms that are also in HSDN\n",
    "symptom_df.in_hsdn.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Side Effects\n",
    "side_effects = networkx.descendants(network, 'D064420') # Drug-Related Side Effects and Adverse Reactions\n",
    "side_effect_df = mesh_df[mesh_df.mesh_id.isin(side_effects)]\n",
    "len(side_effect_df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}