{ "cells": [ { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "import pandas" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
disease_namegene_entrezgene_symbol
0 Adenocarcinoma, Mucinous 10801 SEPT9
1 Adenocarcinoma, Mucinous 10164 CHST4
2 Adenocarcinoma, Mucinous 3860 KRT13
3 Hemorrhagic fevers, Viral 3383 ICAM1
4 Hemorrhagic fevers, Viral 3569 IL6
\n", "
" ], "text/plain": [ " disease_name gene_entrez gene_symbol\n", "0 Adenocarcinoma, Mucinous 10801 SEPT9\n", "1 Adenocarcinoma, Mucinous 10164 CHST4\n", "2 Adenocarcinoma, Mucinous 3860 KRT13\n", "3 Hemorrhagic fevers, Viral 3383 ICAM1\n", "4 Hemorrhagic fevers, Viral 3569 IL6" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# downloaded from http://django.nubic.northwestern.edu/fundo/media/data/do_lite.txt\n", "dolite = pandas.read_csv('do_lite.txt', sep='\\t', names=['disease_name', 'gene_entrez', 'gene_symbol'])\n", "dolite[:5]" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "561" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diseases = sorted(set(dolite['disease_name']))\n", "len(diseases)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [], "source": [ "with open('dolite_terms.txt', 'w') as write_file:\n", " write_file.write('\\n'.join(diseases))" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doidnametype
0 DOID:3301 gonadoblastoma name
1 DOID:3652 Leigh disease name
2 DOID:3652 Infantile necrotizing encephalomyelopathy exact-synonym
3 DOID:3652 juvenile subacute necrotizing encephalomyelopathy exact-synonym
4 DOID:3652 Leigh syndrome exact-synonym
\n", "
" ], "text/plain": [ " doid name type\n", "0 DOID:3301 gonadoblastoma name\n", "1 DOID:3652 Leigh disease name\n", "2 DOID:3652 Infantile necrotizing encephalomyelopathy exact-synonym\n", "3 DOID:3652 juvenile subacute necrotizing encephalomyelopathy exact-synonym\n", "4 DOID:3652 Leigh syndrome exact-synonym" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "path = os.path.join('..', 'data', 'term-names.tsv')\n", "donames = pandas.read_csv(path, sep='\\t')\n", "donames[:5]" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doidname_lower
0 DOID:3301 gonadoblastoma
1 DOID:3652 leigh disease
2 DOID:3652 infantile necrotizing encephalomyelopathy
3 DOID:3652 juvenile subacute necrotizing encephalomyelopathy
4 DOID:3652 leigh syndrome
\n", "
" ], "text/plain": [ " doid name_lower\n", "0 DOID:3301 gonadoblastoma\n", "1 DOID:3652 leigh disease\n", "2 DOID:3652 infantile necrotizing encephalomyelopathy\n", "3 DOID:3652 juvenile subacute necrotizing encephalomyelopathy\n", "4 DOID:3652 leigh syndrome" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "donames['name_lower'] = [x.lower() for x in donames.name]\n", "doname_map = donames[['doid', 'name_lower']].drop_duplicates()\n", "doname_map[:5]" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dolite_namename_lower
0 AIDS aids
1 Abortion abortion
2 Abruption placentae abruption placentae
3 Achalasia and cardiospasm achalasia and cardiospasm
4 Acne acne
\n", "
" ], "text/plain": [ " dolite_name name_lower\n", "0 AIDS aids\n", "1 Abortion abortion\n", "2 Abruption placentae abruption placentae\n", "3 Achalasia and cardiospasm achalasia and cardiospasm\n", "4 Acne acne" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dolite_df = pandas.DataFrame(data = diseases, columns = ['dolite_name'])\n", "dolite_df['name_lower'] = [x.lower() for x in dolite_df.dolite_name]\n", "dolite_df[:5]" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doiddolite_name
0 DOID:635 AIDS
1 NaN Abortion
2 NaN Abruption placentae
3 NaN Achalasia and cardiospasm
4 DOID:6543 Acne
\n", "
" ], "text/plain": [ " doid dolite_name\n", "0 DOID:635 AIDS\n", "1 NaN Abortion\n", "2 NaN Abruption placentae\n", "3 NaN Achalasia and cardiospasm\n", "4 DOID:6543 Acne" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mapping_df = dolite_df.merge(doname_map, how='left')\n", "mapping_df = mapping_df[['doid', 'dolite_name']].drop_duplicates()\n", "mapping_df.to_csv('dolite_to_doid.tsv', sep='\\t', index=False)\n", "mapping_df[:5]" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "372" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# number of matches\n", "sum(isinstance(x, str) for x in mapping_df.doid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.0" } }, "nbformat": 4, "nbformat_minor": 0 }