{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Disease Ontology: descriptions, cross-references, and term names\n",
    "\n",
    "Loads `HumanDO.obo` from the Disease Ontology SVN trunk and writes four tab-separated tables to `data/`:\n",
    "\n",
    "- `description.tsv`: the leading quoted text of each term's definition\n",
    "- `xrefs.tsv`: the cross-references listed on each term\n",
    "- `xrefs-prop.tsv`: each term's cross-references plus those of its `networkx.ancestors`\n",
    "- `term-names.tsv`: the primary name and every synonym of each term"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import re\n",
    "\n",
    "import networkx\n",
    "import pandas\n",
    "\n",
    "import do_tools"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download and load the ontology"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "U download/HumanDO.obo\n",
      "Checked out revision 2816.\n"
     ]
    }
   ],
   "source": [
    "! svn checkout svn://svn.code.sf.net/p/diseaseontology/code/trunk/ download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "path = os.path.join('download', 'HumanDO.obo')\n",
    "do = do_tools.load_do(path)\n",
    "dox = do_tools.do_to_networkx(do)\n",
    "\n",
    "# Every table below is written under data/; create it so a fresh clone can Run All.\n",
    "os.makedirs('data', exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Term descriptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>disease_id</th>\n",
       "      <th>name</th>\n",
       "      <th>description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1509</th>\n",
       "      <td>DOID:0001816</td>\n",
       "      <td>angiosarcoma</td>\n",
       "      <td>A malignant vascular tumor that results_in rap...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3043</th>\n",
       "      <td>DOID:0002116</td>\n",
       "      <td>pterygium</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       " disease_id name \\\n",
       "1509 DOID:0001816 angiosarcoma \n",
       "3043 DOID:0002116 pterygium \n",
       "\n",
       " description \n",
       "1509 A malignant vascular tumor that results_in rap... \n",
       "3043 "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a table of descriptions\n",
    "# The pattern captures the text inside the double quotes that open term.definition.\n",
    "pattern = re.compile(r'^\"(.*?)\"')\n",
    "rows = list()\n",
    "for term in dox:\n",
    "    match = pattern.search(term.definition)\n",
    "    # A definition that does not start with a quoted string yields an empty description.\n",
    "    description = match.group(1) if match else ''\n",
    "    rows.append((term.id, term.name, description))\n",
    "description_df = pandas.DataFrame(rows, columns=['disease_id', 'name', 'description']).sort_values('disease_id')\n",
    "description_df.to_csv(os.path.join('data', 'description.tsv'), sep='\\t', index=False)\n",
    "description_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cross-references"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Resource prefixes to rewrite before output; prefixes not listed here are written unchanged.\n",
    "xref_rename = {\n",
    "    'ICD10CM': 'ICD10',\n",
    "    'ICD9CM': 'ICD9',\n",
    "    'NCI2009_04D': 'NCI',\n",
    "    'SNOMEDCT_2010_1_31': 'SNOMEDCT',\n",
    "    'SNOMEDCT_2013_01_31': 'SNOMEDCT',\n",
    "    'UMLS_CUI': 'UMLS',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def write_xref_row(writer, doid_code, doid_name, xrefs, rename_dict):\n",
    "    \"\"\"\n",
    "    Write one row per cross-reference of a single term.\n",
    "\n",
    "    Each xref is split on its first ':' into (resource, resource_id). A resource\n",
    "    found in rename_dict is replaced by its mapped name. Rows are sorted before\n",
    "    writing, so the output does not depend on the iteration order of xrefs.\n",
    "\n",
    "    writer -- object with a writerows method, such as csv.writer\n",
    "    doid_code, doid_name -- values repeated in the first two columns of every row\n",
    "    xrefs -- iterable of 'resource:resource_id' strings\n",
    "    rename_dict -- mapping from a raw resource prefix to the name to write\n",
    "\n",
    "    Raises ValueError if an xref contains no ':'.\n",
    "    \"\"\"\n",
    "    rows = list()\n",
    "    for xref in xrefs:\n",
    "        resource, resource_id = xref.split(':', 1)\n",
    "        resource = rename_dict.get(resource, resource)\n",
    "        rows.append([doid_code, doid_name, resource, resource_id])\n",
    "    rows.sort()\n",
    "    writer.writerows(rows)\n",
    "\n",
    "header = ['doid_code', 'doid_name', 'resource', 'resource_id']\n",
    "path_unprop = os.path.join('data', 'xrefs.tsv')\n",
    "path_prop = os.path.join('data', 'xrefs-prop.tsv')\n",
    "\n",
    "# The with blocks close both files even if a term raises part-way through.\n",
    "# newline='' is what the csv module asks for on files handed to csv.writer.\n",
    "with open(path_unprop, 'w', newline='') as file_unprop:\n",
    "    with open(path_prop, 'w', newline='') as file_prop:\n",
    "        writer_unprop = csv.writer(file_unprop, delimiter='\\t')\n",
    "        writer_prop = csv.writer(file_prop, delimiter='\\t')\n",
    "\n",
    "        for writer in writer_unprop, writer_prop:\n",
    "            writer.writerow(header)\n",
    "\n",
    "        # NOTE(review): topological_sort_recursive and its reverse argument look like\n",
    "        # NetworkX 1.x API; confirm the installed version before upgrading networkx.\n",
    "        for term in networkx.topological_sort_recursive(dox, reverse=True):\n",
    "            xrefs = set(term.xrefs)\n",
    "            # Propagated set: the term's own xrefs plus those of every node\n",
    "            # returned by networkx.ancestors for this term.\n",
    "            xrefs_prop = set(xrefs)\n",
    "            for ancestor in networkx.ancestors(dox, term):\n",
    "                xrefs_prop |= set(ancestor.xrefs)\n",
    "\n",
    "            write_xref_row(writer_unprop, term.id, term.name, xrefs, xref_rename)\n",
    "            write_xref_row(writer_prop, term.id, term.name, xrefs_prop, xref_rename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CSP',\n",
       " 'CTV3',\n",
       " 'EFO',\n",
       " 'EFOpat_id',\n",
       " 'HP',\n",
       " 'ICD10',\n",
       " 'ICD9',\n",
       " 'KEGG',\n",
       " 'MEDDRA',\n",
       " 'MSH',\n",
       " 'MTH',\n",
       " 'NCI',\n",
       " 'NDFRT',\n",
       " 'OMIM',\n",
       " 'ORDO',\n",
       " 'Orphanet',\n",
       " 'SNOMEDCT',\n",
       " 'UMLS',\n",
       " 'WHO'}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# list of xrefs\n",
    "# Read back the unpropagated table to see which resource labels were written.\n",
    "path = os.path.join('data', 'xrefs.tsv')\n",
    "xref_df = pandas.read_table(path)\n",
    "set(xref_df.resource)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Term names and synonyms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# create a name to term mapping\n",
    "rows = list()\n",
    "for term in dox:\n",
    "    rows.append({'doid': term.id, 'name': term.name, 'type': 'name'})\n",
    "    for synonym in term.synonyms:\n",
    "        # synonym[0] supplies the name; synonym[1], lower-cased, prefixes the type label.\n",
    "        rows.append({'doid': term.id, 'name': synonym[0], 'type': '{}-synonym'.format(synonym[1].lower())})\n",
    "path = os.path.join('data', 'term-names.tsv')\n",
    "# newline='' is what the csv module asks for on files handed to its writers.\n",
    "with open(path, 'w', newline='') as write_file:\n",
    "    writer = csv.DictWriter(write_file, delimiter='\\t', fieldnames=['doid', 'name', 'type'])\n",
    "    writer.writeheader()\n",
    "    writer.writerows(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}