{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>disease_name</th>\n",
       "      <th>gene_entrez</th>\n",
       "      <th>gene_symbol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>  Adenocarcinoma, Mucinous</td>\n",
       "      <td> 10801</td>\n",
       "      <td> SEPT9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>  Adenocarcinoma, Mucinous</td>\n",
       "      <td> 10164</td>\n",
       "      <td> CHST4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>  Adenocarcinoma, Mucinous</td>\n",
       "      <td>  3860</td>\n",
       "      <td> KRT13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> Hemorrhagic fevers, Viral</td>\n",
       "      <td>  3383</td>\n",
       "      <td> ICAM1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> Hemorrhagic fevers, Viral</td>\n",
       "      <td>  3569</td>\n",
       "      <td>   IL6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                disease_name  gene_entrez gene_symbol\n",
       "0   Adenocarcinoma, Mucinous        10801       SEPT9\n",
       "1   Adenocarcinoma, Mucinous        10164       CHST4\n",
       "2   Adenocarcinoma, Mucinous         3860       KRT13\n",
       "3  Hemorrhagic fevers, Viral         3383       ICAM1\n",
       "4  Hemorrhagic fevers, Viral         3569         IL6"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# downloaded from http://django.nubic.northwestern.edu/fundo/media/data/do_lite.txt\n",
    "# Tab-separated file read from the working directory; column names are supplied via `names`.\n",
    "dolite = pandas.read_csv(\n",
    "    'do_lite.txt',\n",
    "    sep='\\t',\n",
    "    names=['disease_name', 'gene_entrez', 'gene_symbol'],\n",
    ")\n",
    "# Preview the first five rows.\n",
    "dolite.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "561"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct disease names in sorted order; the cell displays how many there are.\n",
    "diseases = list(set(dolite['disease_name']))\n",
    "diseases.sort()\n",
    "len(diseases)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Write the disease names one per line (no newline after the last name).\n",
    "with open('dolite_terms.txt', 'w') as terms_file:\n",
    "    terms_file.write('\\n'.join(diseases))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid</th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> DOID:3301</td>\n",
       "      <td>                                    gonadoblastoma</td>\n",
       "      <td>          name</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> DOID:3652</td>\n",
       "      <td>                                     Leigh disease</td>\n",
       "      <td>          name</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> DOID:3652</td>\n",
       "      <td>         Infantile necrotizing encephalomyelopathy</td>\n",
       "      <td> exact-synonym</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> DOID:3652</td>\n",
       "      <td> juvenile subacute necrotizing encephalomyelopathy</td>\n",
       "      <td> exact-synonym</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> DOID:3652</td>\n",
       "      <td>                                    Leigh syndrome</td>\n",
       "      <td> exact-synonym</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        doid                                               name           type\n",
       "0  DOID:3301                                     gonadoblastoma           name\n",
       "1  DOID:3652                                      Leigh disease           name\n",
       "2  DOID:3652          Infantile necrotizing encephalomyelopathy  exact-synonym\n",
       "3  DOID:3652  juvenile subacute necrotizing encephalomyelopathy  exact-synonym\n",
       "4  DOID:3652                                     Leigh syndrome  exact-synonym"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the tab-separated term-names table from the sibling data directory.\n",
    "path = os.path.join(os.pardir, 'data', 'term-names.tsv')\n",
    "donames = pandas.read_csv(path, sep='\\t')\n",
    "# Preview the first five rows.\n",
    "donames.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid</th>\n",
       "      <th>name_lower</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> DOID:3301</td>\n",
       "      <td>                                    gonadoblastoma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> DOID:3652</td>\n",
       "      <td>                                     leigh disease</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> DOID:3652</td>\n",
       "      <td>         infantile necrotizing encephalomyelopathy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> DOID:3652</td>\n",
       "      <td> juvenile subacute necrotizing encephalomyelopathy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> DOID:3652</td>\n",
       "      <td>                                    leigh syndrome</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        doid                                         name_lower\n",
       "0  DOID:3301                                     gonadoblastoma\n",
       "1  DOID:3652                                      leigh disease\n",
       "2  DOID:3652          infantile necrotizing encephalomyelopathy\n",
       "3  DOID:3652  juvenile subacute necrotizing encephalomyelopathy\n",
       "4  DOID:3652                                     leigh syndrome"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Add a lowercased copy of each term name.\n",
    "# The vectorized .str accessor leaves missing names as NaN, whereas calling\n",
    "# .lower() on each element raises AttributeError on a float NaN.\n",
    "donames['name_lower'] = donames['name'].str.lower()\n",
    "# Keep one row per distinct (doid, lowercased name) pair.\n",
    "doname_map = donames[['doid', 'name_lower']].drop_duplicates()\n",
    "doname_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dolite_name</th>\n",
       "      <th>name_lower</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>                      AIDS</td>\n",
       "      <td>                      aids</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>                  Abortion</td>\n",
       "      <td>                  abortion</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>       Abruption placentae</td>\n",
       "      <td>       abruption placentae</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> Achalasia and cardiospasm</td>\n",
       "      <td> achalasia and cardiospasm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>                      Acne</td>\n",
       "      <td>                      acne</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 dolite_name                 name_lower\n",
       "0                       AIDS                       aids\n",
       "1                   Abortion                   abortion\n",
       "2        Abruption placentae        abruption placentae\n",
       "3  Achalasia and cardiospasm  achalasia and cardiospasm\n",
       "4                       Acne                       acne"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per distinct disease name, plus a lowercased copy of the name.\n",
    "dolite_df = pandas.DataFrame(data=diseases, columns=['dolite_name'])\n",
    "# Vectorized string method instead of a Python-level loop over the column.\n",
    "dolite_df['name_lower'] = dolite_df['dolite_name'].str.lower()\n",
    "dolite_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid</th>\n",
       "      <th>dolite_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>  DOID:635</td>\n",
       "      <td>                      AIDS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>       NaN</td>\n",
       "      <td>                  Abortion</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>       NaN</td>\n",
       "      <td>       Abruption placentae</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>       NaN</td>\n",
       "      <td> Achalasia and cardiospasm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> DOID:6543</td>\n",
       "      <td>                      Acne</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        doid                dolite_name\n",
       "0   DOID:635                       AIDS\n",
       "1        NaN                   Abortion\n",
       "2        NaN        Abruption placentae\n",
       "3        NaN  Achalasia and cardiospasm\n",
       "4  DOID:6543                       Acne"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Left join keeps every row of dolite_df; names without a match get a NaN doid.\n",
    "# The join key is spelled out so the merge does not depend on whichever\n",
    "# column names the two frames happen to share.\n",
    "mapping_df = dolite_df.merge(doname_map, how='left', on='name_lower')\n",
    "# Drop the helper column and collapse repeated (doid, dolite_name) pairs.\n",
    "mapping_df = mapping_df[['doid', 'dolite_name']].drop_duplicates()\n",
    "# Save as a tab-separated file without the index column.\n",
    "mapping_df.to_csv('dolite_to_doid.tsv', sep='\\t', index=False)\n",
    "mapping_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "372"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of matches: count the rows whose doid value is a string\n",
    "len([doid for doid in mapping_df.doid if isinstance(doid, str)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}