{
"cells": [
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n",
"import functools\n",
"import itertools\n",
"import gzip\n",
"\n",
"import pandas\n",
"\n",
"import eutility"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" doid_code | \n",
" doid_name | \n",
" mesh_id | \n",
" mesh_name | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" DOID:2531 | \n",
" hematologic cancer | \n",
" D019337 | \n",
" Hematologic Neoplasms | \n",
"
\n",
" \n",
" 1 | \n",
" DOID:1319 | \n",
" brain cancer | \n",
" D001932 | \n",
" Brain Neoplasms | \n",
"
\n",
" \n",
" 2 | \n",
" DOID:263 | \n",
" kidney cancer | \n",
" D007680 | \n",
" Kidney Neoplasms | \n",
"
\n",
" \n",
" 3 | \n",
" DOID:1793 | \n",
" pancreatic cancer | \n",
" D010190 | \n",
" Pancreatic Neoplasms | \n",
"
\n",
" \n",
" 4 | \n",
" DOID:4159 | \n",
" skin cancer | \n",
" D012878 | \n",
" Skin Neoplasms | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doid_code doid_name mesh_id mesh_name\n",
"0 DOID:2531 hematologic cancer D019337 Hematologic Neoplasms\n",
"1 DOID:1319 brain cancer D001932 Brain Neoplasms\n",
"2 DOID:263 kidney cancer D007680 Kidney Neoplasms\n",
"3 DOID:1793 pancreatic cancer D010190 Pancreatic Neoplasms\n",
"4 DOID:4159 skin cancer D012878 Skin Neoplasms"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read MeSH terms to MeSH names\n",
"url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'\n",
"mesh_df = pandas.read_table(url)\n",
"\n",
"# Read MeSH terms mapped to DO Slim terms\n",
"url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/9fd75f14b17e01bebc97faf1bfa1b9025e9ce4de/data/xrefs-slim.tsv'\n",
"doslim_xref_df = pandas.read_table(url)\n",
"doslim_xref_df = doslim_xref_df[doslim_xref_df.resource == 'MSH'][['doid_code', 'doid_name', 'resource_id']].rename(columns={'resource_id': 'mesh_id'})\n",
"disease_df = doslim_xref_df.merge(mesh_df)\n",
"disease_df.to_csv('data/DO-slim-to-mesh.tsv', sep='\\t', index=False)\n",
"disease_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Diseases"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7199 articles for Hematologic Neoplasms\n",
"98466 articles for Brain Neoplasms\n",
"48992 articles for Kidney Neoplasms\n",
"45514 articles for Pancreatic Neoplasms\n",
"84775 articles for Skin Neoplasms\n",
"82806 articles for Bone Neoplasms\n",
"53364 articles for Ovarian Neoplasms\n",
"186097 articles for Breast Neoplasms\n",
"48742 articles for Glioma\n",
"88124 articles for Uterine Neoplasms\n",
"18380 articles for Adrenal Gland Neoplasms\n",
"32981 articles for Esophageal Neoplasms\n",
"12012 articles for Salivary Gland Neoplasms\n",
"81939 articles for Prostatic Neoplasms\n",
"63628 articles for Stomach Neoplasms\n",
"37204 articles for Urinary Bladder Neoplasms\n",
"14652 articles for Peripheral Nervous System Neoplasms\n",
"96253 articles for Liver Neoplasms\n",
"3491 articles for Vaginal Neoplasms\n",
"207632 articles for Head and Neck Neoplasms\n",
"32426 articles for Rectal Neoplasms\n",
"28522 articles for Eye Neoplasms\n",
"50324 articles for Colonic Neoplasms\n",
"19300 articles for Laryngeal Neoplasms\n",
"222943 articles for Neoplasms, Germ Cell and Embryonal\n",
"7270 articles for Thymus Neoplasms\n",
"8485 articles for Myosarcoma\n",
"1981 articles for Appendiceal Neoplasms\n",
"3430 articles for Ureteral Neoplasms\n",
"5828 articles for Vulvar Neoplasms\n",
"1636 articles for Jejunal Neoplasms\n",
"2432 articles for Vascular Neoplasms\n",
"9690 articles for Mesothelioma\n",
"59405 articles for Melanoma\n",
"2017 articles for Fallopian Tube Neoplasms\n",
"18355 articles for Testicular Neoplasms\n",
"5867 articles for Gallbladder Neoplasms\n",
"15100 articles for Meningeal Neoplasms\n",
"10948 articles for Bile Duct Neoplasms\n",
"9539 articles for Mediastinal Neoplasms\n",
"7679 articles for Spinal Cord Neoplasms\n",
"6221 articles for Retroperitoneal Neoplasms\n",
"24654 articles for Crohn Disease\n",
"38980 articles for Multiple Sclerosis\n",
"71135 articles for Diabetes Mellitus, Type 2\n",
"21812 articles for Colitis, Ulcerative\n",
"50368 articles for Diabetes Mellitus, Type 1\n",
"75897 articles for Arthritis, Rheumatoid\n",
"32486 articles for Coronary Artery Disease\n",
"143223 articles for Coronary Disease\n",
"293061 articles for Myocardial Ischemia\n",
"103287 articles for Obesity\n",
"13615 articles for Celiac Disease\n",
"39378 articles for Lupus Erythematosus, Systemic\n",
"19549 articles for Refractive Errors\n",
"5256 articles for Liver Cirrhosis, Biliary\n",
"3445 articles for Vitiligo\n",
"13218 articles for Macular Degeneration\n",
"15999 articles for Metabolic Syndrome X\n",
"87932 articles for Asthma\n",
"68694 articles for Schizophrenia\n",
"17883 articles for Migraine Disorders\n",
"54563 articles for Alzheimer Disease\n",
"10904 articles for Graves Disease\n",
"39786 articles for Parkinson Disease\n",
"11640 articles for Dermatitis, Atopic\n",
"24216 articles for Bipolar Disorder\n",
"9297 articles for Spondylitis, Ankylosing\n",
"8748 articles for Polycystic Ovary Syndrome\n",
"154312 articles for Hypertension\n",
"13910 articles for Scleroderma, Systemic\n",
"6705 articles for Behcet Syndrome\n",
"3802 articles for Osteitis Deformans\n",
"18506 articles for Leprosy\n",
"18546 articles for Intracranial Aneurysm\n",
"35084 articles for Glaucoma\n",
"11318 articles for Amyotrophic Lateral Sclerosis\n",
"2261 articles for Restless Legs Syndrome\n",
"4264 articles for Mucocutaneous Lymph Node Syndrome\n",
"17451 articles for Atherosclerosis\n",
"2110 articles for Alopecia Areata\n",
"32190 articles for Osteoporosis\n",
"20180 articles for Hypothyroidism\n",
"4156 articles for Glomerulonephritis, IGA\n",
"4443 articles for Creutzfeldt-Jakob Syndrome\n",
"841 articles for Azoospermia\n",
"102081 articles for Epilepsy\n",
"36282 articles for Hepatitis B\n",
"30074 articles for Pulmonary Disease, Chronic Obstructive\n",
"12701 articles for Aortic Aneurysm, Abdominal\n",
"54351 articles for Kidney Failure, Chronic\n",
"2930 articles for Arthritis, Psoriatic\n",
"1895 articles for Glomerulonephritis, Membranous\n",
"5868 articles for Diabetes, Gestational\n",
"43052 articles for Malaria\n",
"13789 articles for Autistic Disorder\n",
"9994 articles for Cardiomyopathy, Dilated\n",
"716 articles for Arthritis, Gouty\n",
"14214 articles for Leiomyoma\n",
"2271 articles for Cholangitis, Sclerosing\n",
"2353 articles for Narcolepsy\n",
"9434 articles for Cleft Lip\n",
"1223 articles for Idiopathic Pulmonary Fibrosis\n",
"16577 articles for Attention Deficit Disorder with Hyperactivity\n",
"3113 articles for Tourette Syndrome\n",
"7767 articles for Aortic Aneurysm, Thoracic\n",
"62986 articles for Depressive Disorder\n",
"34975 articles for Pancreatitis\n",
"12096 articles for Nephrolithiasis\n",
"16153 articles for Periodontitis\n",
"5172 articles for Barrett Esophagus\n",
"535 articles for Fuchs' Endothelial Dystrophy\n",
"3861 articles for Otosclerosis\n",
"1469 articles for Conduct Disorder\n",
"2932 articles for Glomerulosclerosis, Focal Segmental\n",
"25723 articles for Dental Caries\n",
"104377 articles for Anemia\n",
"4587 articles for Panic Disorder\n",
"58288 articles for Acquired Immunodeficiency Syndrome\n"
]
}
],
"source": [
"rows_out = list()\n",
"\n",
"for i, row in disease_df.iterrows():\n",
" term_query = '{disease}[MeSH Major Topic]'.format(disease = row.mesh_name.lower())\n",
" payload = {'db': 'pubmed', 'term': term_query}\n",
" pmids = eutility.esearch_query(payload, retmax = 10000)\n",
" row['term_query'] = term_query\n",
" row['n_articles'] = len(pmids)\n",
" row['pubmed_ids'] = '|'.join(pmids)\n",
" rows_out.append(row)\n",
" print('{} articles for {}'.format(len(pmids), row.mesh_name))\n",
"\n",
"disease_pmids_df = pandas.DataFrame(rows_out)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with gzip.open('data/disease-pmids.tsv.gz', 'w') as write_file:\n",
" write_file = io.TextIOWrapper(write_file)\n",
" disease_pmids_df.to_csv(write_file, sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Symptoms"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" mesh_id | \n",
" mesh_name | \n",
" in_hsdn | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" D000006 | \n",
" Abdomen, Acute | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" D000270 | \n",
" Adie Syndrome | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" D000326 | \n",
" Adrenoleukodystrophy | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" D000334 | \n",
" Aerophagy | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" D000370 | \n",
" Ageusia | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" mesh_id mesh_name in_hsdn\n",
"0 D000006 Abdomen, Acute 1\n",
"1 D000270 Adie Syndrome 0\n",
"2 D000326 Adrenoleukodystrophy 0\n",
"3 D000334 Aerophagy 1\n",
"4 D000370 Ageusia 1"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read MeSH Symptoms\n",
"url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/symptoms.tsv'\n",
"symptom_df = pandas.read_table(url)\n",
"symptom_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rows_out = list()\n",
"\n",
"for i, row in symptom_df.iterrows():\n",
" term_query = '{symptom}[MeSH Terms:noexp]'.format(symptom = row.mesh_name.lower())\n",
" payload = {'db': 'pubmed', 'term': term_query}\n",
" pmids = eutility.esearch_query(payload, retmax = 5000, sleep=2)\n",
" row['term_query'] = term_query\n",
" row['n_articles'] = len(pmids)\n",
" row['pubmed_ids'] = '|'.join(pmids)\n",
" rows_out.append(row)\n",
" print('{} articles for {}'.format(len(pmids), row.mesh_name))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" mesh_id | \n",
" mesh_name | \n",
" in_hsdn | \n",
" term_query | \n",
" n_articles | \n",
" pubmed_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" D000006 | \n",
" Abdomen, Acute | \n",
" 1 | \n",
" abdomen, acute[MeSH Terms:noexp] | \n",
" 8465 | \n",
" 25669229|25650451|25619050|25608417|25543890|2... | \n",
"
\n",
" \n",
" 1 | \n",
" D000270 | \n",
" Adie Syndrome | \n",
" 0 | \n",
" adie syndrome[MeSH Terms:noexp] | \n",
" 311 | \n",
" 24995781|24625775|24215593|23952008|23809464|2... | \n",
"
\n",
" \n",
" 2 | \n",
" D000326 | \n",
" Adrenoleukodystrophy | \n",
" 0 | \n",
" adrenoleukodystrophy[MeSH Terms:noexp] | \n",
" 1506 | \n",
" 25583825|25378668|25297370|25275259|25149411|2... | \n",
"
\n",
" \n",
" 3 | \n",
" D000334 | \n",
" Aerophagy | \n",
" 1 | \n",
" aerophagy[MeSH Terms:noexp] | \n",
" 260 | \n",
" 25073665|24796405|23772202|23772201|23636521|2... | \n",
"
\n",
" \n",
" 4 | \n",
" D000370 | \n",
" Ageusia | \n",
" 1 | \n",
" ageusia[MeSH Terms:noexp] | \n",
" 220 | \n",
" 24825557|24782205|24191925|24137848|24088167|2... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" mesh_id mesh_name in_hsdn \\\n",
"0 D000006 Abdomen, Acute 1 \n",
"1 D000270 Adie Syndrome 0 \n",
"2 D000326 Adrenoleukodystrophy 0 \n",
"3 D000334 Aerophagy 1 \n",
"4 D000370 Ageusia 1 \n",
"\n",
" term_query n_articles \\\n",
"0 abdomen, acute[MeSH Terms:noexp] 8465 \n",
"1 adie syndrome[MeSH Terms:noexp] 311 \n",
"2 adrenoleukodystrophy[MeSH Terms:noexp] 1506 \n",
"3 aerophagy[MeSH Terms:noexp] 260 \n",
"4 ageusia[MeSH Terms:noexp] 220 \n",
"\n",
" pubmed_ids \n",
"0 25669229|25650451|25619050|25608417|25543890|2... \n",
"1 24995781|24625775|24215593|23952008|23809464|2... \n",
"2 25583825|25378668|25297370|25275259|25149411|2... \n",
"3 25073665|24796405|23772202|23772201|23636521|2... \n",
"4 24825557|24782205|24191925|24137848|24088167|2... "
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"symptom_pmids_df = pandas.DataFrame(rows_out)\n",
"\n",
"with gzip.open('data/symptom-pmids.tsv.gz', 'w') as write_file:\n",
" write_file = io.TextIOWrapper(write_file)\n",
" symptom_pmids_df.to_csv(write_file, sep='\\t', index=False)\n",
"\n",
"symptom_pmids_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def read_pmids_tsv(path, key, min_articles = 5):\n",
" term_to_pmids = dict()\n",
" pmids_df = pandas.read_table(path, compression='gzip')\n",
" pmids_df = pmids_df[pmids_df.n_articles >= min_articles]\n",
" for i, row in pmids_df.iterrows():\n",
" term = row[key]\n",
" pmids = row.pubmed_ids.split('|')\n",
" term_to_pmids[term] = set(pmids)\n",
" pmids_df.drop('pubmed_ids', axis=1, inplace=True)\n",
" return pmids_df, term_to_pmids"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"symptom_df, symptom_to_pmids = read_pmids_tsv('data/symptom-pmids.tsv.gz', key='mesh_id')\n",
"disease_df, disease_to_pmids = read_pmids_tsv('data/disease-pmids.tsv.gz', key='doid_code')"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1741776"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"symptom_pmids = set.union(*symptom_to_pmids.values())\n",
"len(symptom_pmids)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"3413567"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"disease_pmids = set.union(*disease_to_pmids.values())\n",
"len(disease_pmids)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids, term0_name='term_0', term1_name='term_1'):\n",
" all_pmids0 = set.union(*term0_to_pmids.values())\n",
" all_pmids1 = set.union(*term1_to_pmids.values())\n",
" pmids_in_both = all_pmids0 & all_pmids1\n",
" total_pmids = len(pmids_in_both)\n",
" \n",
" term0_to_pmids = term0_to_pmids.copy()\n",
" term1_to_pmids = term1_to_pmids.copy()\n",
" for d in term0_to_pmids, term1_to_pmids:\n",
" for key, value in list(d.items()):\n",
" d[key] = value & pmids_in_both\n",
" if not d[key]:\n",
" del d[key]\n",
" \n",
" rows = list()\n",
" for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):\n",
" pmids0 = term0_to_pmids[term0]\n",
" pmids1 = term1_to_pmids[term1]\n",
" count = len(pmids0 & pmids1)\n",
" expected = len(pmids0) * len(pmids1) / total_pmids\n",
" enrichment = count / expected\n",
" contingency_table = [[count, total_pmids - count], [expected, total_pmids - expected]]\n",
" oddsratio, pvalue = scipy.stats.fisher_exact(contingency_table, alternative='greater')\n",
" rows.append([term0, term1, count, expected, enrichment, oddsratio, pvalue])\n",
" columns = [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment', 'odds_ratio', 'p_fisher']\n",
" df = pandas.DataFrame(rows, columns=columns)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doid_code | \n",
" doid_name | \n",
" mesh_id | \n",
" mesh_name | \n",
" cooccurrence | \n",
" expected | \n",
" enrichment | \n",
" odds_ratio | \n",
" p_fisher | \n",
"
\n",
" \n",
" \n",
" \n",
" 26450 | \n",
" DOID:10652 | \n",
" Alzheimer's disease | \n",
" D008569 | \n",
" Memory Disorders | \n",
" 1564 | \n",
" 67.100289 | \n",
" 23.308394 | \n",
" 23.432635 | \n",
" 0.000000e+00 | \n",
"
\n",
" \n",
" 26360 | \n",
" DOID:10652 | \n",
" Alzheimer's disease | \n",
" D004314 | \n",
" Down Syndrome | \n",
" 800 | \n",
" 32.051845 | \n",
" 24.959561 | \n",
" 25.048966 | \n",
" 1.443810e-193 | \n",
"
\n",
" \n",
" 26494 | \n",
" DOID:10652 | \n",
" Alzheimer's disease | \n",
" D011595 | \n",
" Psychomotor Agitation | \n",
" 331 | \n",
" 13.675650 | \n",
" 24.203603 | \n",
" 25.482125 | \n",
" 3.284937e-81 | \n",
"
\n",
" \n",
" 26299 | \n",
" DOID:10652 | \n",
" Alzheimer's disease | \n",
" D000647 | \n",
" Amnesia | \n",
" 303 | \n",
" 12.059837 | \n",
" 25.124717 | \n",
" 25.268675 | \n",
" 2.382544e-74 | \n",
"
\n",
" \n",
" 26423 | \n",
" DOID:10652 | \n",
" Alzheimer's disease | \n",
" D006816 | \n",
" Huntington Disease | \n",
" 251 | \n",
" 10.884701 | \n",
" 23.059889 | \n",
" 25.115362 | \n",
" 9.196812e-62 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doid_code doid_name mesh_id mesh_name \\\n",
"26450 DOID:10652 Alzheimer's disease D008569 Memory Disorders \n",
"26360 DOID:10652 Alzheimer's disease D004314 Down Syndrome \n",
"26494 DOID:10652 Alzheimer's disease D011595 Psychomotor Agitation \n",
"26299 DOID:10652 Alzheimer's disease D000647 Amnesia \n",
"26423 DOID:10652 Alzheimer's disease D006816 Huntington Disease \n",
"\n",
" cooccurrence expected enrichment odds_ratio p_fisher \n",
"26450 1564 67.100289 23.308394 23.432635 0.000000e+00 \n",
"26360 800 32.051845 24.959561 25.048966 1.443810e-193 \n",
"26494 331 13.675650 24.203603 25.482125 3.284937e-81 \n",
"26299 303 12.059837 25.124717 25.268675 2.382544e-74 \n",
"26423 251 10.884701 23.059889 25.115362 9.196812e-62 "
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cooc_df = score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids, 'doid_code', 'mesh_id')\n",
"cooc_df = symptom_df[['mesh_id', 'mesh_name']].merge(cooc_df)\n",
"cooc_df = disease_df[['doid_code', 'doid_name']].merge(cooc_df)\n",
"cooc_df = cooc_df.sort(['doid_name', 'p_fisher'])\n",
"cooc_df.to_csv('data/disease-symptom-cooccurrence.tsv', index=False, sep='\\t')\n",
"cooc_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy\n",
"import scipy\n",
"import seaborn\n",
"import matplotlib.pyplot as plt\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": [
"iVBORw0KGgoAAAANSUhEUgAAAXYAAAECCAYAAADq7fyyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n",
"AAALEgAACxIB0t1+/AAAERNJREFUeJzt3X2MZXV9x/H3QZmFhWVKqYtKjZPG7jfGFAvq2lKFJaUg\n",
"bQ1mY/9RadlUHqohNmlD60rd0kDYhGpSKpIGUNTUNpEsVEN4KGpc3D/cULUV2X4XarZ1UwN2hFmW\n",
"ZWWXnf5xz8Awex9m7t47597fvF/Jzd57zrnnfmd27ueee87voZqdnUWSVI7jmi5AkjRYBrskFcZg\n",
"l6TCGOySVBiDXZIKY7BLUmFe3W1lRBwPfA54I7AKuB7YBdwJHAEeBT6ambMRcTlwBXAYuD4z7x1i\n",
"3ZKkDnodsX8Q+Glmngu8B7gF+BSwuV5WAZdExGuBq4FzgIuAGyNiYnhlS5I66XrEDnwFuKu+fxxw\n",
"CDg7M7fXy+4DLgReBHZk5iHgUEQ8AZwJPDL4kiVJ3XQN9sx8DiAi1tAK+WuBv523ybPAJHAKMNNm\n",
"uSRpmfW8eBoRbwC+AXwxM/+J1rn1OacAzwD7gDXzlq8Bnh5gnZKkRep18fR04EHgI5n5zXrx9yLi\n",
"vMz8FnAx8HVgJ3BDRKwCTgDeTOvCajcHaV2QlSQtXtVzg26DgEXE3wF/AOS8xR8DbgYmgMeAy+tW\n",
"MR+m1SrmOOCGzLy7x2vPLqbAEWCdg2WdgzMONYJ1LruuwT5k4/JLtM7Bss7BGYcawTqXnR2UJKkw\n",
"BrskFcZgl6TCGOySVBiDXZIKY7BLUmEMdkkqjMEuSYUx2CWpMAa7JBXGYJekwvSaaENtVFU1AUx1\n",
"WL1ndnb2hWUsR5JewWDvz9T6jVty9eTaVyw8MPMUO7ddF8DuZsqSJIO9b6sn13LyqWc0XYYkHcVz\n",
"7JJUGINdkgpjsEtSYQx2SSqMwS5JhbFVTMN6tIk/vv73UIf1tpmXdBSDfYCOvHgYYKqqOs6H2y6I\n",
"27aJB5jeu4sT15xGu3W2mZfUicE+QAf3T7N+45YHlhrEndrEH5h50vbykpbMYB8wg1hS07x4KkmF\n",
"MdglqTAGuyQVxmCXpMIY7JJUGINdkgpjsEtSYQx2SSqMwS5JhTHYJakwBrskFcZgl6TCGOySVBhH\n",
"d1wmXcZqn1r2YiQVzWBfJp3Gap/eu6uhiiSVymBfRu3Gaj8w8+RAX6PHVHvgdHpS8VZ8sHcJwuMB\n",
"MpOIWLdgXbvtR0XHqfacTk9aGVZ8sNMhCOfmG71y60Ns2HRLLlw3ypzFSVrZDHY6nyLpNhepJI0q\n",
"mztKUmEMdkkqjMEuSYUx2CWpMAa7JBXGYJekwiyquWNEvBPYmpnnR8RZwNeAx+vVn83Mr0TE5cAV\n",
"wGHg+sy8dygVCzh67Jl5HammGixL0gjoGewRcQ3wIWB/vehtwKcz89PztnktcHW97kTg2xHxr5lp\n",
"1/UhWTj2zFxHqlHvPCVp+BZzxP4EsBH4Uv34bcC6iLiE1lH7nwLrgR2ZeQg4FBFPAGcCjwy+ZM1Z\n",
"jrFnJI2fnufYM3MbrdMrc74D/Hlmngf8CNgCrAFm5m3zLDA5wDolSYvUz8XTuzPze3P3gbOAfbTC\n",
"fc4a4OlF7Gu26VtmvmIcmNLVP2+Tv3Mafv2S6hyHGq1zOHV21U+w3x8R76jvX0DrdMtO4N0RsSoi\n",
"JoE3A48uYl9V07eIiD5+B2Or/nmb/J3T8OuXVOc41Gidw6mzq6UMAjb3SXEVcEtEHAJ+AlyRmfsj\n",
"4mbgYVofFpu9cDp6usziBI7TLhVjUcGemXuAc+r7/w68q802twO3D7I4DVanWZwcp10qi8P2rjCO\n",
"1S6Vz56nklQYg12SCmOwS1JhDHZJKozBLkmFMdglqTAGuyQVxmCXpMIY7JJUGINdkgpjsEtSYQx2\n",
"SSqMwS5JhTHYJakwBrskFcZgl6TCGOySVBiDXZIKY7BLUmEMdkkqjMEuSYUx2CWpMAa7JBXGYJek\n",
"whjsklQYg12SCmOwS1JhDHZJKozBLkmFeXXTBSyHqqomgKkOqzstXzGOvHgYYKqqqk6b7JmdnX1h\n",
"+SqSdCxWRLADU+s3bsnVk2uPWjG9d1cD5YyWg/unWb9xywPtfj8HZp5i57brAti9/JVJ6sdKCXZW\n",
"T67l5FPPOGr5gZknG6hm9HT6/UgaP55jl6TCGOySVBiDXZIKY7BLUmEMdkkqjMEuSYUx2CWpMAa7\n",
"JBXGYJekwhjsklQYg12SCmOwS1JhDHZJKozBLkmFMdglqTCLGo89It4JbM3M8yPiTcCdwBHgUeCj\n",
"mTkbEZcDVwCHgesz894h1SxJ6qLnEXtEXAPcBqyqF30a2JyZ5wIVcElEvBa4GjgHuAi4MSImhlOy\n",
"JKmbxZyKeQLYSCvEAc7OzO31/fuAC4B3ADsy81Bm7qufc+agi5Uk9dbzVExmbouIqXmL5s94/Cww\n",
"CZwCzLRZrjHXY6JrJ7mWRlA/c54emXf/FOAZYB+wZt7yNcDTi9jXbB+vv2SZyZVbH1qOlypOp4mu\n",
"D8w8xZdu/EC/u12W//cBGIc6x6FGsM5BanuUNV8/rWK+FxHn1fcvBrYDO4F3R8SqiJgE3kzrwupi\n",
"Chz6LSKij59TtbmJruffVk+upf69LvX/gz6e08RtHOochxqtczh1drWUI/a5T7I/A26rL44+BtxV\n",
"t4q5GXiY1ofF5sz0K7okNWBRwZ6Ze2i1eCEzHwc2tNnmduD2AdYmSeqDHZQkqTAGuyQVxmCXpMIY\n",
"7JJUmH7asUu9Oi5Bm85LVVVNZCYRsW6xz5G0dAa7+tKp4xK0Oi/t3HZdALsXrJq69ONfZsOmW3IJ\n",
"z5G0RAa7+jbXcWnYz5G0NEUFe1VVE8BUm1XtlklSkYoKdmBq/cYtufD0wPTeXQ2VI0nLr7Rgb/tV\n",
"/8DMkw1VI0nLz+aOklQYg12SCmOwS1JhijvHruZ16bw0tezFSCuQwa6B69R5ydZJ0vIw2DUUtk6S\n",
"muM5dkkqjMEuSYUx2CWpMAa7JBXGYJekwhjsklQYg12SCmOwS1JhDHZJKow9TzUSekyO7STX0hIY\n",
"7BoJncaXcZJraekMdo0MJ7qWBsNz7JJUGINdkgpjsEtSYQx2SSqMwS5JhTHYJakwBrskFcZgl6TC\n",
"GOySVBiDXZIKY7BLUmEMdkkqjMEuSYUx2CWpMAa7JBXGYJekwhjsklQYg12SCmOwS1JhDHZJKkzf\n",
"k1lHxHeBmfrhj4AbgTuBI8CjwEczc/ZYC5QkLU1fwR4RJwBk5vnzln0V2JyZ2yPiVuAS4J6BVClJ\n",
"WrR+j9jfCqyOiAfqfXwCODszt9fr7wMuxGCXpGXXb7A/B9yUmXdExK8C9y9Yvx+YPKbKJODIi4cB\n",
"pqqq6rTJntnZ2ReWryJp9PUb7LuBJwAy8/GImAbOmrd+DfDMIvYz0HPwmcmVWx8a5C7VsIP7p1m/\n",
"ccsDqyfXHrXuwMxTfOnGDwzz5cfhGtE41AjWOUgdj3Lm9NsqZhPwKYCIeD2tIH8wIs6r118MbO/w\n",
"3IUFDuwWEdHnz6MRtnpyLSefesZRt9WTa6n/zwf6d8TLb5xh7Hel1Widw6mzq36P2O8APh8Rc+G9\n",
"CZgGbouICeAx4K4+9y1JOgZ9BXtmHgYubbNqwzFVI0k6ZnZQkqTCGOySVBiDXZIKY7BLUmEMdkkq\n",
"TN+DgElN69Er1R6pWrHGLtirqpoApjqs7rRcBerUK/XAzFPs3HZd0OohLa04YxfswNT6jVuyXRfz\n",
"6b27GihHTZrrlSrpZeMY7B3fzAdmnmygGo0aBw7TSjeWwS5102vgME/TqHQGu4rkKRqtZDZ3lKTC\n",
"GOxSraqqiaqq1lVVtW737t3M3a9vE03XJy2Wp2K0ovS4sDo1d27+yq0PsWHTLQmel9f4Mdi1onS7\n",
"sDq9d5fn5lUEg10rjs1lVTrPsUtSYQx2SSqMwS5JhTHYJakwBrskFcZgl6TCGOySVBiDXZIKYwcl\n",
"6Rj0mNHLcd/VCINdOjZtZ/RyfBk1yWCXjlG7IQqcxUlNMtilIXAWJzVpZIO9y7nLdsukkeNIkWrK\n",
"yAY7Hc5dTu/d1VA5kjQeRjnY2x7xOLSqJHVnO3ZJKozBLkmFGelTMVKJejSFtBmkjpnBLvXQawLs\n",
"pe6vU1NIm0FqUAx2qYdeE2D3w6aQGiaDXVoEJ8DWOPHiqSQVxmCXpMIY7JJUGINdGgNVVU1UVbVu\n",
"/m337t3U9yeark+jxYun0ng4auykK7c+xNvfd20+cs/1F1VVtafD82wXvwIZ7NKY6DR2ksMDayGD\n",
"XRpzS20T32M6P/Aof+w1GuxVqyvfSR1Wn7CctUhNG3QP1y7aDokNHuWXoukj9uPftP79Pzztl9+y\n",
"euGKHz/2zf9uoiCpKcPo4dqJPV/L1nSwc+rrYtVrps76pYXLf/a/u/Y0UI7UqEH2cO3yDWCqj9I0\n",
"RgYa7BFxHPBZ4Ezg58CHM/O/Bvkakhan0zeAbkf/jjxZhkEfsb8PmMjMcyLincCn6mWSGrDUWcgc\n",
"ebIMgw723wLuB8jM70TE2we8f0lD1u7DoMeRPCzxaN6WOcM16GA/Bdg37/GLEXFcZh4Z8OtIWkbd\n",
"Luz2eTTfeMuchR8umUlErKsfDv2DZZgfboMO9n3AmnmPe4b6T//n+08fPvT8/oXLn53+8QsHZp46\n",
"avvnn/0Z0P6oodO60p4zCjWU9pxRqGHUn3PimtPaPqfW8Wh+QWC+tH23nXXb3wBN/doFf/LACSf/\n",
"IgAf+osv8o73fSIP7v8ZP3jo1m69eYfy+vPVNfT94VbNzs4ea3EviYiNwHszc1NE/AbwV5n5ewN7\n",
"AUlST4M+Yr8b+J2I2FE/3jTg/UuSehjoEbskqXkO2ytJhTHYJakwBrskFcZgl6TCLPsgYOM2nkw9\n",
"NMLWzDy/6VraiYjjgc8BbwRWAddn5teareqVIuJVwG3AOmAWuCozf9hsVZ1FxFrg34DfzsyR7EIf\n",
"Ed8FZuqHP8rMP26ynk4i4uPAe4Hjgc9k5hcaLukoEfFHwGX1wxOBtwKnZ+a+jk9qQJ2dt9N6Hx0B\n",
"Ls/MbLdtE0fsL40nA/wlrfFkRlJEXEMrkFY1XUsXHwR+mpnnAu8BPtNwPe38PnAkM98FXAvc0HA9\n",
"HdUflP8APNd0LZ1ExAkAmXl+fRvVUN8A/Gb9Xt8A/EqjBXWQmV+Y+10CjwBXj1qo1y4ETqrfR39D\n",
"l/dRE8H+ivFkgFEeT+YJYCOduumNhq8An6zvHwccbrCWtjLzX4Ar64dTwNPNVdPTTcCtwE+aLqSL\n",
"twKrI+KBiPh6/a1yFF0I/CAi7gG+Bny14Xq6qse2ektm3t50LR08D0xGRAVMAh2HG2gi2NuOJ9NA\n",
"HT1l5jZGMCjny8znMnN/RKyhFfKfaLqmdjLzxYi4E7gZ+HLD5bQVEZfR+vbzYL1oVD/QnwNuysyL\n",
"gKuAfxzR99BrgLcB76eus9lyetoM/HXTRXSxg9bMcv9J61vl33fasIk/hiWPJ6PuIuINwDeAL2bm\n",
"PzddTyeZeRmt84O3RcSJDZfTziZaPae/Cfw68IWIOL3hmtrZTR2Smfk4MA28rtGK2vs/4MHMPFxf\n",
"qzgYEUdNqjMKIuIXgHWZ+a2ma+niGmBHZgYv/31OtNuwiWDfAfwuQD2ezH80UEMx6uB5ELgmM+9s\n",
"uJy2IuLS+iIatL5OHqlvIyUzz8vMDfW51u8Df5iZS5+6aPg2UV+biojX0/oWPIqnjr5N67rPXJ0n\n",
"0foQGkXnAl9vuogeTuLlsx1P07og/ap2GzYxNd44jiczyuMubKZ1vu2TETF3rv3izDzYYE0L3QXc\n",
"GRHfovXH+LHM/HnDNY2zO4DPR8T2+vGmUfzWm5n3RsS5EbGT1kHkRzJzVN9L64CRbZ1Xu4nW//vD\n",
"tN5HH8/M59tt6FgxklSYUbzgIkk6Bga7JBXGYJekwhjsklQYg12SCmOwS1JhDHZJKozBLkmF+X9n\n",
"0IqGsnvUWAAAAABJRU5ErkJggg==\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sig_df = cooc_df[cooc_df.p_fisher < 0.05]\n",
"plt.hist(list(numpy.log(sig_df.enrichment)), bins = 50);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}