{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import re\n",
    "\n",
    "import networkx\n",
    "import pandas\n",
    "\n",
    "import do_tools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "U    download/HumanDO.obo\n",
      "Checked out revision 2816.\n"
     ]
    }
   ],
   "source": [
    "! svn checkout svn://svn.code.sf.net/p/diseaseontology/code/trunk/ download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "path = os.path.join('download', 'HumanDO.obo')\n",
    "do = do_tools.load_do(path)\n",
    "dox = do_tools.do_to_networkx(do)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>disease_id</th>\n",
       "      <th>name</th>\n",
       "      <th>description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1509</th>\n",
       "      <td>DOID:0001816</td>\n",
       "      <td>angiosarcoma</td>\n",
       "      <td>A malignant vascular tumor that results_in rap...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3043</th>\n",
       "      <td>DOID:0002116</td>\n",
       "      <td>pterygium</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        disease_id          name  \\\n",
       "1509  DOID:0001816  angiosarcoma   \n",
       "3043  DOID:0002116     pterygium   \n",
       "\n",
       "                                            description  \n",
       "1509  A malignant vascular tumor that results_in rap...  \n",
       "3043                                                     "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a table of descriptions\n",
    "pattern = re.compile(r'^\"(.*?)\"')\n",
    "rows = list()\n",
    "for term in dox:\n",
    "    match = pattern.search(term.definition)\n",
    "    description = match.group(1) if match else ''\n",
    "    rows.append((term.id, term.name, description))\n",
    "description_df = pandas.DataFrame(rows, columns = ['disease_id', 'name', 'description']).sort_values('disease_id')\n",
    "description_df.to_csv('data/description.tsv', sep='\\t', index=False)\n",
    "description_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "xref_rename = {\n",
    "    'ICD10CM': 'ICD10',\n",
    "    'ICD9CM': 'ICD9',\n",
    "    'NCI2009_04D': 'NCI',\n",
    "    'SNOMEDCT_2010_1_31': 'SNOMEDCT',\n",
    "    'SNOMEDCT_2013_01_31': 'SNOMEDCT',\n",
    "    'UMLS_CUI': 'UMLS',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def write_xref_row(writer, doid_code, doid_name, xrefs, rename_dict):\n",
    "    rows = list()\n",
    "    for xref in xrefs:\n",
    "        resource, resource_id = xref.split(':', 1)\n",
    "        if resource in rename_dict:\n",
    "            resource = rename_dict[resource]\n",
    "        rows.append([doid_code, doid_name, resource, resource_id])\n",
    "    rows.sort()\n",
    "    writer.writerows(rows)\n",
    "\n",
    "file_unprop = open(os.path.join('data', 'xrefs.tsv'), 'w')\n",
    "file_prop = open(os.path.join('data', 'xrefs-prop.tsv'), 'w')\n",
    "\n",
    "writer_unprop = csv.writer(file_unprop, delimiter='\\t')\n",
    "writer_prop = csv.writer(file_prop, delimiter='\\t')\n",
    "\n",
    "for writer in writer_unprop, writer_prop:\n",
    "    writer.writerow(['doid_code', 'doid_name', 'resource', 'resource_id'])\n",
    "\n",
    "for term in networkx.topological_sort_recursive(dox, reverse=True):\n",
    "    xrefs = set(term.xrefs)\n",
    "    xrefs_prop = set(xrefs)\n",
    "    for ancestor in networkx.ancestors(dox, term):\n",
    "        xrefs_prop |= set(ancestor.xrefs)\n",
    "    \n",
    "    write_xref_row(writer_unprop, term.id, term.name, xrefs, xref_rename)\n",
    "    write_xref_row(writer_prop, term.id, term.name, xrefs_prop, xref_rename)\n",
    "\n",
    "for write_file in file_unprop, file_prop:\n",
    "    write_file.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CSP',\n",
       " 'CTV3',\n",
       " 'EFO',\n",
       " 'EFOpat_id',\n",
       " 'HP',\n",
       " 'ICD10',\n",
       " 'ICD9',\n",
       " 'KEGG',\n",
       " 'MEDDRA',\n",
       " 'MSH',\n",
       " 'MTH',\n",
       " 'NCI',\n",
       " 'NDFRT',\n",
       " 'OMIM',\n",
       " 'ORDO',\n",
       " 'Orphanet',\n",
       " 'SNOMEDCT',\n",
       " 'UMLS',\n",
       " 'WHO'}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# list of xrefs\n",
    "import pandas\n",
    "path = os.path.join('data', 'xrefs.tsv')\n",
    "xref_df = pandas.read_table(path)\n",
    "set(xref_df.resource)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# create a name to term mapping\n",
    "rows = list()\n",
    "for term in dox:\n",
    "    rows.append({'doid': term.id, 'name': term.name, 'type': 'name'})\n",
    "    for synonym in term.synonyms:\n",
    "        rows.append({'doid': term.id, 'name': synonym[0], 'type': '{}-synonym'.format(synonym[1].lower())})\n",
    "path = os.path.join('data', 'term-names.tsv')\n",
    "with open(path, 'w') as write_file:\n",
    "    writer = csv.DictWriter(write_file, delimiter='\\t', fieldnames=['doid', 'name', 'type'])\n",
    "    writer.writeheader()\n",
    "    writer.writerows(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}