{
"cells": [
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import pandas"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" disease_name | \n",
" gene_entrez | \n",
" gene_symbol | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adenocarcinoma, Mucinous | \n",
" 10801 | \n",
" SEPT9 | \n",
"
\n",
" \n",
" 1 | \n",
" Adenocarcinoma, Mucinous | \n",
" 10164 | \n",
" CHST4 | \n",
"
\n",
" \n",
" 2 | \n",
" Adenocarcinoma, Mucinous | \n",
" 3860 | \n",
" KRT13 | \n",
"
\n",
" \n",
" 3 | \n",
" Hemorrhagic fevers, Viral | \n",
" 3383 | \n",
" ICAM1 | \n",
"
\n",
" \n",
" 4 | \n",
" Hemorrhagic fevers, Viral | \n",
" 3569 | \n",
" IL6 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" disease_name gene_entrez gene_symbol\n",
"0 Adenocarcinoma, Mucinous 10801 SEPT9\n",
"1 Adenocarcinoma, Mucinous 10164 CHST4\n",
"2 Adenocarcinoma, Mucinous 3860 KRT13\n",
"3 Hemorrhagic fevers, Viral 3383 ICAM1\n",
"4 Hemorrhagic fevers, Viral 3569 IL6"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# downloaded from http://django.nubic.northwestern.edu/fundo/media/data/do_lite.txt\n",
"dolite = pandas.read_csv('do_lite.txt', sep='\\t', names=['disease_name', 'gene_entrez', 'gene_symbol'])\n",
"dolite[:5]"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"561"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diseases = sorted(set(dolite['disease_name']))\n",
"len(diseases)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with open('dolite_terms.txt', 'w') as write_file:\n",
" write_file.write('\\n'.join(diseases))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doid | \n",
" name | \n",
" type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DOID:3301 | \n",
" gonadoblastoma | \n",
" name | \n",
"
\n",
" \n",
" 1 | \n",
" DOID:3652 | \n",
" Leigh disease | \n",
" name | \n",
"
\n",
" \n",
" 2 | \n",
" DOID:3652 | \n",
" Infantile necrotizing encephalomyelopathy | \n",
" exact-synonym | \n",
"
\n",
" \n",
" 3 | \n",
" DOID:3652 | \n",
" juvenile subacute necrotizing encephalomyelopathy | \n",
" exact-synonym | \n",
"
\n",
" \n",
" 4 | \n",
" DOID:3652 | \n",
" Leigh syndrome | \n",
" exact-synonym | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doid name type\n",
"0 DOID:3301 gonadoblastoma name\n",
"1 DOID:3652 Leigh disease name\n",
"2 DOID:3652 Infantile necrotizing encephalomyelopathy exact-synonym\n",
"3 DOID:3652 juvenile subacute necrotizing encephalomyelopathy exact-synonym\n",
"4 DOID:3652 Leigh syndrome exact-synonym"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = os.path.join('..', 'data', 'term-names.tsv')\n",
"donames = pandas.read_csv(path, sep='\\t')\n",
"donames[:5]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doid | \n",
" name_lower | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DOID:3301 | \n",
" gonadoblastoma | \n",
"
\n",
" \n",
" 1 | \n",
" DOID:3652 | \n",
" leigh disease | \n",
"
\n",
" \n",
" 2 | \n",
" DOID:3652 | \n",
" infantile necrotizing encephalomyelopathy | \n",
"
\n",
" \n",
" 3 | \n",
" DOID:3652 | \n",
" juvenile subacute necrotizing encephalomyelopathy | \n",
"
\n",
" \n",
" 4 | \n",
" DOID:3652 | \n",
" leigh syndrome | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doid name_lower\n",
"0 DOID:3301 gonadoblastoma\n",
"1 DOID:3652 leigh disease\n",
"2 DOID:3652 infantile necrotizing encephalomyelopathy\n",
"3 DOID:3652 juvenile subacute necrotizing encephalomyelopathy\n",
"4 DOID:3652 leigh syndrome"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"donames['name_lower'] = [x.lower() for x in donames.name]\n",
"doname_map = donames[['doid', 'name_lower']].drop_duplicates()\n",
"doname_map[:5]"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" dolite_name | \n",
" name_lower | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AIDS | \n",
" aids | \n",
"
\n",
" \n",
" 1 | \n",
" Abortion | \n",
" abortion | \n",
"
\n",
" \n",
" 2 | \n",
" Abruption placentae | \n",
" abruption placentae | \n",
"
\n",
" \n",
" 3 | \n",
" Achalasia and cardiospasm | \n",
" achalasia and cardiospasm | \n",
"
\n",
" \n",
" 4 | \n",
" Acne | \n",
" acne | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" dolite_name name_lower\n",
"0 AIDS aids\n",
"1 Abortion abortion\n",
"2 Abruption placentae abruption placentae\n",
"3 Achalasia and cardiospasm achalasia and cardiospasm\n",
"4 Acne acne"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dolite_df = pandas.DataFrame(data = diseases, columns = ['dolite_name'])\n",
"dolite_df['name_lower'] = [x.lower() for x in dolite_df.dolite_name]\n",
"dolite_df[:5]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doid | \n",
" dolite_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DOID:635 | \n",
" AIDS | \n",
"
\n",
" \n",
" 1 | \n",
" NaN | \n",
" Abortion | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" Abruption placentae | \n",
"
\n",
" \n",
" 3 | \n",
" NaN | \n",
" Achalasia and cardiospasm | \n",
"
\n",
" \n",
" 4 | \n",
" DOID:6543 | \n",
" Acne | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doid dolite_name\n",
"0 DOID:635 AIDS\n",
"1 NaN Abortion\n",
"2 NaN Abruption placentae\n",
"3 NaN Achalasia and cardiospasm\n",
"4 DOID:6543 Acne"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mapping_df = dolite_df.merge(doname_map, how='left')\n",
"mapping_df = mapping_df[['doid', 'dolite_name']].drop_duplicates()\n",
"mapping_df.to_csv('dolite_to_doid.tsv', sep='\\t', index=False)\n",
"mapping_df[:5]"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"372"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# number of matches\n",
"sum(isinstance(x, str) for x in mapping_df.doid)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}