{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract disease-gene associations from the GWAS Catalog\n",
    "\n",
    "Method based on our [previousely reported protocal](https://dx.doi.org/10.1101/011569), but that replaces HapMap linkage data from DAPPLE with 1000 Genomes Pilot data from SNAP:\n",
    "\n",
    "1. Start with the GWAS Catalog produced by the EBI, which included EFO mappings for the disease/trait\n",
    "2. Convert EFO terms into disease ontology terms that are in DO Slim\n",
    "3. Identify SNPs in LD with lead SNPs using a manual SNAP query\n",
    "4. Identify a window for each lead SNP based on the furthest upstream and downstream SNPs with $r^2 \\geq 0.5$.\n",
    "5. For each disease, overlap windows, in order of significance, into loci.\n",
    "6. Classify a loci as high confidence if the sample size was at least 1000 and $p \\leq 5 \\times 10^{-8}$.\n",
    "7. For each loci, identify a primary gene and classify the rest as secondary. Only protein-coding genes are considered."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import json\n",
    "import math\n",
    "import collections\n",
    "\n",
    "import requests\n",
    "import numpy\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load GWAS Catalog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read EBI's GWAS Catalog with ontology annotations\n",
    "# https://www.ebi.ac.uk/gwas/docs/fileheaders\n",
    "path = 'download/gwas_catalog_v1.0.1-downloaded_2015-06-08.tsv.gz'\n",
    "dtype_dict = {'UPSTREAM_GENE_ID': numpy.character,\n",
    "              'DOWNSTREAM_GENE_ID': numpy.character,\n",
    "              'SNP_GENE_IDS': numpy.character}\n",
    "ebi_df = pandas.read_table(path, compression='gzip', low_memory=False, dtype=dtype_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>DATE ADDED TO CATALOG</th>\n",
       "      <th>PUBMEDID</th>\n",
       "      <th>FIRST AUTHOR</th>\n",
       "      <th>DATE</th>\n",
       "      <th>JOURNAL</th>\n",
       "      <th>LINK</th>\n",
       "      <th>STUDY</th>\n",
       "      <th>DISEASE/TRAIT</th>\n",
       "      <th>INITIAL SAMPLE DESCRIPTION</th>\n",
       "      <th>REPLICATION SAMPLE DESCRIPTION</th>\n",
       "      <th>...</th>\n",
       "      <th>RISK ALLELE FREQUENCY</th>\n",
       "      <th>P-VALUE</th>\n",
       "      <th>PVALUE_MLOG</th>\n",
       "      <th>P-VALUE (TEXT)</th>\n",
       "      <th>OR or BETA</th>\n",
       "      <th>95% CI (TEXT)</th>\n",
       "      <th>PLATFORM [SNPS PASSING QC]</th>\n",
       "      <th>CNV</th>\n",
       "      <th>MAPPED_TRAIT</th>\n",
       "      <th>MAPPED_TRAIT_URI</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.47</td>\n",
       "      <td>2.000000e-12</td>\n",
       "      <td>11.698970004336019</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.04</td>\n",
       "      <td>[0.03-0.05] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.29</td>\n",
       "      <td>7.000000e-20</td>\n",
       "      <td>19.154901959985743</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.05</td>\n",
       "      <td>[0.038-0.062] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.29</td>\n",
       "      <td>7.000000e-20</td>\n",
       "      <td>19.154901959985743</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.05</td>\n",
       "      <td>[0.038-0.062] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.46</td>\n",
       "      <td>3.000000e-08</td>\n",
       "      <td>7.522878745280337</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.03</td>\n",
       "      <td>[0.02-0.04] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.46</td>\n",
       "      <td>3.000000e-08</td>\n",
       "      <td>7.522878745280337</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.03</td>\n",
       "      <td>[0.02-0.04] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  DATE ADDED TO CATALOG  PUBMEDID FIRST AUTHOR         DATE JOURNAL  \\\n",
       "0                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "1                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "2                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "3                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "4                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "\n",
       "                                         LINK  \\\n",
       "0  http://europepmc.org/abstract/MED/25231870   \n",
       "1  http://europepmc.org/abstract/MED/25231870   \n",
       "2  http://europepmc.org/abstract/MED/25231870   \n",
       "3  http://europepmc.org/abstract/MED/25231870   \n",
       "4  http://europepmc.org/abstract/MED/25231870   \n",
       "\n",
       "                                               STUDY            DISEASE/TRAIT  \\\n",
       "0  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "1  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "2  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "3  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "4  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "\n",
       "              INITIAL SAMPLE DESCRIPTION REPLICATION SAMPLE DESCRIPTION  \\\n",
       "0  Up to 182,413 European ancestry women                            NaN   \n",
       "1  Up to 182,413 European ancestry women                            NaN   \n",
       "2  Up to 182,413 European ancestry women                            NaN   \n",
       "3  Up to 182,413 European ancestry women                            NaN   \n",
       "4  Up to 182,413 European ancestry women                            NaN   \n",
       "\n",
       "                   ...                  RISK ALLELE FREQUENCY       P-VALUE  \\\n",
       "0                  ...                                   0.47  2.000000e-12   \n",
       "1                  ...                                   0.29  7.000000e-20   \n",
       "2                  ...                                   0.29  7.000000e-20   \n",
       "3                  ...                                   0.46  3.000000e-08   \n",
       "4                  ...                                   0.46  3.000000e-08   \n",
       "\n",
       "          PVALUE_MLOG P-VALUE (TEXT) OR or BETA  \\\n",
       "0  11.698970004336019            NaN       0.04   \n",
       "1  19.154901959985743            NaN       0.05   \n",
       "2  19.154901959985743            NaN       0.05   \n",
       "3   7.522878745280337            NaN       0.03   \n",
       "4   7.522878745280337            NaN       0.03   \n",
       "\n",
       "                   95% CI (TEXT)                   PLATFORM [SNPS PASSING QC]  \\\n",
       "0    [0.03-0.05] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "1  [0.038-0.062] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "2  [0.038-0.062] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "3    [0.02-0.04] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "4    [0.02-0.04] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "\n",
       "  CNV     MAPPED_TRAIT                      MAPPED_TRAIT_URI  \n",
       "0   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "1   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "2   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "3   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "4   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "\n",
       "[5 rows x 36 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ebi_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Entrez Gene information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read entrez symbol to gene mapping\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/symbols-human.json'\n",
    "symbol_to_id = requests.get(url).json()\n",
    "\n",
    "# Read entrez synonym to genes mapping\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/synonyms-human.json'\n",
    "synonym_to_ids = requests.get(url).json()\n",
    "\n",
    "# Read entrez genes\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/genes-human.tsv'\n",
    "entrez_df = pandas.read_table(url)\n",
    "\n",
    "# Create a set of coding genes\n",
    "coding_genes = set(entrez_df.GeneID[entrez_df.type_of_gene == 'protein-coding'].astype(str))\n",
    "len(coding_genes)\n",
    "\n",
    "# Create a symbol dataframe\n",
    "symbol_df = entrez_df[['GeneID', 'Symbol']].rename(columns={'GeneID': 'gene', 'Symbol': 'symbol'})\n",
    "symbol_df.gene = symbol_df.gene.astype(str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert from EFO to Disease Ontology"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create a uri_df (for cross-references)\n",
    "rows = list()\n",
    "for uri in filter(pandas.notnull, set(ebi_df['MAPPED_TRAIT_URI'])):\n",
    "    head, tail = uri.rsplit('/', 1)\n",
    "    resource, resource_id = tail.split('_', 1)\n",
    "    rows.append([uri, resource, resource_id])\n",
    "    \n",
    "uri_df = pandas.DataFrame(rows, columns=['MAPPED_TRAIT_URI', 'resource', 'resource_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read DO Slim propagated cross-references\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs-prop-slim.tsv'\n",
    "doxref_df = pandas.read_table(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10342"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inner join the GWAS catalog with the DO slim mapping \n",
    "map_df = uri_df.merge(doxref_df)\n",
    "map_df = map_df[['MAPPED_TRAIT_URI', 'doid_code', 'doid_name']]\n",
    "ebi_df = ebi_df.merge(map_df)\n",
    "len(ebi_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Identify lead SNPs and proxy search using SNAP\n",
    "\n",
    "The SNAP proxy search must be run manually. Windows are computed in a companion notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5255"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Write all lead SNPs for SNAP input\n",
    "gwas_snps = set('rs{}'.format(x) for x in ebi_df.SNP_ID_CURRENT if pandas.notnull(x))\n",
    "gwas_snps = sorted(gwas_snps)\n",
    "with open('data/snap/do-slim-lead-SNPs.txt', 'w') as write_file:\n",
    "    write_file.write('\\n'.join(gwas_snps))\n",
    "len(gwas_snps)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process associations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "HC_mlogp = -math.log10(5e-8)\n",
    "HC_samples = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def extract_sample_size(text):\n",
    "    pattern_cases = r'([\\d,]+)[^\\d]*?cases'\n",
    "    pattern_contols = r'([\\d,]+)[^\\d]*?controls'\n",
    "    pattern_samples = r'[\\d,]+'\n",
    "    \n",
    "    remove_commas = lambda x: int(x.replace(',', ''))\n",
    "    \n",
    "    sample_size = {'cases': None, 'controls': None, 'samples': None,}\n",
    "    \n",
    "    for key, pattern in ('cases', pattern_cases), ('controls', pattern_contols):\n",
    "        try:\n",
    "            match = re.search(pattern, text)\n",
    "            sample_size[key] = remove_commas(match.group(1))\n",
    "        except (IndexError, AttributeError, ValueError):\n",
    "            continue\n",
    "    \n",
    "    if sample_size['cases'] is not None and sample_size['controls'] is not None:\n",
    "        sample_size['samples'] = sample_size['cases'] + sample_size['controls']\n",
    "    \n",
    "    if sample_size['cases'] is None and sample_size['controls'] is None:\n",
    "        try:\n",
    "            match = re.search(pattern_samples, text)\n",
    "            sample_size['samples'] = remove_commas(match.group(0))\n",
    "        except (IndexError, AttributeError, ValueError):\n",
    "            pass\n",
    "    \n",
    "    return sample_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def process_reported_symbols(text):\n",
    "    if pandas.isnull(text):\n",
    "        return set()\n",
    "    symbols = set(re.split(r'[, ]+', text))\n",
    "    symbols -= {'Intergenic', 'NR', 'Pending'}\n",
    "    entrez_ids = set()\n",
    "    for symbol in symbols:\n",
    "        gene_id = symbol_to_id.get(symbol)\n",
    "        if not gene_id:\n",
    "            gene_ids = synonym_to_ids.get(symbol, [])\n",
    "            if len(gene_ids) != 1:\n",
    "                continue\n",
    "            gene_id, = gene_ids\n",
    "        entrez_ids.add(str(gene_id))\n",
    "    return entrez_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid_code</th>\n",
       "      <th>doid_name</th>\n",
       "      <th>reported_symbols</th>\n",
       "      <th>pubmed_id</th>\n",
       "      <th>upstream_gene</th>\n",
       "      <th>date</th>\n",
       "      <th>containing_genes</th>\n",
       "      <th>lead_snp</th>\n",
       "      <th>mlog_pvalue</th>\n",
       "      <th>downstream_gene</th>\n",
       "      <th>reported_genes</th>\n",
       "      <th>cases</th>\n",
       "      <th>controls</th>\n",
       "      <th>samples</th>\n",
       "      <th>high_confidence</th>\n",
       "      <th>lead_chrom</th>\n",
       "      <th>lower_coord</th>\n",
       "      <th>upper_coord</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DOID:1686</td>\n",
       "      <td>glaucoma</td>\n",
       "      <td>ABCA1</td>\n",
       "      <td>25173107</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2014-08-31</td>\n",
       "      <td>set([])</td>\n",
       "      <td>rs2487032</td>\n",
       "      <td>13.154902</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([19])</td>\n",
       "      <td>966</td>\n",
       "      <td>1005</td>\n",
       "      <td>1971</td>\n",
       "      <td>1</td>\n",
       "      <td>chr9</td>\n",
       "      <td>106731644</td>\n",
       "      <td>106747174</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DOID:1686</td>\n",
       "      <td>glaucoma</td>\n",
       "      <td>PMM2</td>\n",
       "      <td>25173107</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2014-08-31</td>\n",
       "      <td>set([])</td>\n",
       "      <td>rs3785176</td>\n",
       "      <td>6.698970</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([5373])</td>\n",
       "      <td>966</td>\n",
       "      <td>1005</td>\n",
       "      <td>1971</td>\n",
       "      <td>0</td>\n",
       "      <td>chr16</td>\n",
       "      <td>8804432</td>\n",
       "      <td>8854102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DOID:1686</td>\n",
       "      <td>glaucoma</td>\n",
       "      <td>CDKN2B-AS1</td>\n",
       "      <td>25173105</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2014-08-31</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>rs4977756</td>\n",
       "      <td>29.154902</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>1155</td>\n",
       "      <td>1922</td>\n",
       "      <td>3077</td>\n",
       "      <td>1</td>\n",
       "      <td>chr9</td>\n",
       "      <td>21993367</td>\n",
       "      <td>22111349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DOID:1686</td>\n",
       "      <td>glaucoma</td>\n",
       "      <td>CDKN2B-AS1</td>\n",
       "      <td>25173105</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2014-08-31</td>\n",
       "      <td>set([102724137])</td>\n",
       "      <td>rs4977756</td>\n",
       "      <td>29.154902</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>1155</td>\n",
       "      <td>1922</td>\n",
       "      <td>3077</td>\n",
       "      <td>1</td>\n",
       "      <td>chr9</td>\n",
       "      <td>21993367</td>\n",
       "      <td>22111349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>DOID:1686</td>\n",
       "      <td>glaucoma</td>\n",
       "      <td>CDKN2B-AS1</td>\n",
       "      <td>21532571</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2011-05-01</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>rs4977756</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>590</td>\n",
       "      <td>3956</td>\n",
       "      <td>4546</td>\n",
       "      <td>1</td>\n",
       "      <td>chr9</td>\n",
       "      <td>21993367</td>\n",
       "      <td>22111349</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   doid_code doid_name reported_symbols  pubmed_id upstream_gene       date  \\\n",
       "0  DOID:1686  glaucoma            ABCA1   25173107           NaN 2014-08-31   \n",
       "1  DOID:1686  glaucoma             PMM2   25173107           NaN 2014-08-31   \n",
       "2  DOID:1686  glaucoma       CDKN2B-AS1   25173105           NaN 2014-08-31   \n",
       "3  DOID:1686  glaucoma       CDKN2B-AS1   25173105           NaN 2014-08-31   \n",
       "4  DOID:1686  glaucoma       CDKN2B-AS1   21532571           NaN 2011-05-01   \n",
       "\n",
       "   containing_genes   lead_snp  mlog_pvalue downstream_gene    reported_genes  \\\n",
       "0           set([])  rs2487032    13.154902             NaN         set([19])   \n",
       "1           set([])  rs3785176     6.698970             NaN       set([5373])   \n",
       "2  set([100048912])  rs4977756    29.154902             NaN  set([100048912])   \n",
       "3  set([102724137])  rs4977756    29.154902             NaN  set([100048912])   \n",
       "4  set([100048912])  rs4977756    14.000000             NaN  set([100048912])   \n",
       "\n",
       "   cases  controls  samples  high_confidence lead_chrom  lower_coord  \\\n",
       "0    966      1005     1971                1       chr9    106731644   \n",
       "1    966      1005     1971                0      chr16      8804432   \n",
       "2   1155      1922     3077                1       chr9     21993367   \n",
       "3   1155      1922     3077                1       chr9     21993367   \n",
       "4    590      3956     4546                1       chr9     21993367   \n",
       "\n",
       "   upper_coord  \n",
       "0    106747174  \n",
       "1      8854102  \n",
       "2     22111349  \n",
       "3     22111349  \n",
       "4     22111349  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rename and order columns\n",
    "rename_dict = {'SNP_ID_CURRENT': 'lead_snp', 'REPORTED GENE(S)': 'symbols', 'PUBMEDID': 'pubmed_id',\n",
    "              'INITIAL SAMPLE DESCRIPTION': 'sample_description', 'PVALUE_MLOG': 'mlog_pvalue',\n",
    "              'DATE': 'date', 'REPORTED GENE(S)': 'reported_symbols', 'UPSTREAM_GENE_ID': 'upstream_gene',\n",
    "              'DOWNSTREAM_GENE_ID': 'downstream_gene', 'SNP_GENE_IDS': 'containing_genes'}\n",
    "columns = ['doid_code', 'doid_name'] + list(rename_dict.values())\n",
    "association_df = ebi_df.rename(columns = rename_dict)[columns]\n",
    "\n",
    "# split containing genes\n",
    "association_df.containing_genes = association_df.containing_genes.map(\n",
    "    lambda x: set(x.split(';')) if pandas.notnull(x) else set())\n",
    "\n",
    "# convert reported symbols to entrez GeneIDs\n",
    "association_df['reported_genes'] = association_df.reported_symbols.map(process_reported_symbols)\n",
    "\n",
    "# convert dates\n",
    "association_df.date = pandas.to_datetime(association_df.date)\n",
    "\n",
    "# Add rs to SNP names\n",
    "association_df.lead_snp = 'rs' + association_df.lead_snp\n",
    "\n",
    "# extract sample size information\n",
    "sample_size_df = pandas.DataFrame(list(map(extract_sample_size, association_df.sample_description)))\n",
    "association_df = association_df.drop(['sample_description'], axis=1)\n",
    "association_df = pandas.concat([association_df, sample_size_df], axis=1)\n",
    "\n",
    "# convert mlog_pavlue to numeric\n",
    "association_df.mlog_pvalue = association_df.mlog_pvalue.convert_objects(convert_numeric=True)\n",
    "association_df['high_confidence'] = (\n",
    "    (association_df.mlog_pvalue >= HC_mlogp) &\n",
    "    (association_df.samples >= HC_samples)).astype(int)\n",
    "\n",
    "# Add window information\n",
    "window_df = pandas.read_table('data/snap/windows.tsv')\n",
    "association_df = association_df.merge(window_df[['lead_snp', 'lead_chrom', 'lower_coord', 'upper_coord']])\n",
    "\n",
    "association_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Identify disease-specific loci"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def identify_loci(disease_df):\n",
    "    disease_df = disease_df.sort(['high_confidence', 'mlog_pvalue', 'date'], ascending=False)\n",
    "    \n",
    "    loci = dict()\n",
    "    \n",
    "    for i, row in disease_df.iterrows():\n",
    "        interval = row.lead_chrom, row.lower_coord, row.upper_coord\n",
    "        for chrom, lower, upper in loci:\n",
    "            if interval[0] == chrom and (\n",
    "                (lower <= interval[1] and interval[1] <= upper) or\n",
    "                (lower <= interval[2] and interval[2] <= upper)):\n",
    "                loci[(chrom, lower, upper)].append(row)\n",
    "                break\n",
    "        else:\n",
    "            loci[interval] = [row]\n",
    "    \n",
    "    # convert each locus to a dataframe\n",
    "    loci = list(map(pandas.DataFrame, loci.values()))\n",
    "\n",
    "    for i, locus_df in enumerate(loci):\n",
    "        locus_df['locus'] = i\n",
    "    \n",
    "    return pandas.concat(loci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "association_df = association_df.groupby('doid_code', as_index=False).apply(identify_loci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save associations to tsv\n",
    "association_write_df = association_df.copy()\n",
    "for column in ['reported_genes', 'containing_genes']:\n",
    "    association_write_df[column] = association_write_df[column].map(lambda x: '|'.join(x))\n",
    "association_write_df.to_csv('data/snp-associations.tsv', sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Annotate genes to loci "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def max_counter_keys(counter, key_subset=None):\n",
    "    if key_subset is not None:\n",
    "        counter = collections.Counter({k: v for\n",
    "            k, v in counter.items() if k in key_subset})\n",
    "    max_value = max(list(counter.values()) + [0])\n",
    "    max_keys = [k for k, v in counter.items() if v == max_value]\n",
    "    return max_keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def associate_by_gene(locus_df):\n",
    "    \n",
    "    # sort associations by precedence\n",
    "    locus_df = locus_df.sort(['high_confidence', 'mlog_pvalue', 'date'], ascending=False)\n",
    "    \n",
    "    # Count the number of times each gene is reported across all studies in a loci\n",
    "    reported_counts = collections.Counter()\n",
    "    for genes in locus_df.reported_genes:\n",
    "        reported_counts.update(genes)\n",
    "    \n",
    "    primary_gene = None\n",
    "    \n",
    "    # Identify a single mode reported gene\n",
    "    mode_reported = max_counter_keys(reported_counts, coding_genes)\n",
    "    if len(mode_reported) == 1:\n",
    "        primary_gene, = mode_reported\n",
    "    \n",
    "    # Iterate through associations attempting to resolve primary-gene ambiguity\n",
    "    for i, row in locus_df.iterrows():\n",
    "        if primary_gene:\n",
    "            break\n",
    "\n",
    "        # Find containing genes\n",
    "        containing_genes = row.containing_genes & coding_genes\n",
    "        \n",
    "        # If a lead SNP is in a gene, consider the containing gene primary\n",
    "        if len(containing_genes) == 1:\n",
    "            primary_gene, = containing_genes\n",
    "            break\n",
    "        \n",
    "        # If the lead SNP is in multiple genes, take the mode reported of those genes\n",
    "        mode_reported = max_counter_keys(reported_counts, containing_genes)\n",
    "        if len(mode_reported) == 1:\n",
    "            primary_gene, = mode_reported\n",
    "            break\n",
    "\n",
    "        # Take the more reported gene from the upstream and downstream gene\n",
    "        stream_genes = {gene for gene in [row.downstream_gene, row.upstream_gene] if pandas.notnull(gene)}\n",
    "        mode_reported = max_counter_keys(reported_counts, coding_genes & stream_genes)\n",
    "        if len(mode_reported) == 1:\n",
    "            primary_gene, = mode_reported\n",
    "            break\n",
    "\n",
    "    # All genes except the primary are considered secondary\n",
    "    secondary_genes = set()\n",
    "    for column in ['reported_genes', 'containing_genes']:\n",
    "        for genes in locus_df[column]:\n",
    "            secondary_genes |= genes\n",
    "    secondary_genes |= set(locus_df.upstream_gene)\n",
    "    secondary_genes |= set(locus_df.downstream_gene)\n",
    "    secondary_genes = set(filter(pandas.notnull, secondary_genes))\n",
    "    secondary_genes.discard(primary_gene)\n",
    "    secondary_genes &= coding_genes\n",
    "    \n",
    "    # Create a dataframe to return\n",
    "    columns = ['doid_code', 'doid_name', 'locus', 'high_confidence', 'primary', 'gene']\n",
    "    rows = list()\n",
    "    first_row = locus_df.iloc[0]\n",
    "    base_row = first_row.doid_code, first_row.doid_name, first_row.locus, first_row.high_confidence\n",
    "    if primary_gene:\n",
    "        rows.append(base_row + (1, primary_gene))\n",
    "    for secondary_gene in secondary_genes:\n",
    "        rows.append(base_row + (0, secondary_gene))\n",
    "    gene_df = pandas.DataFrame(rows, columns=columns)\n",
    "    \n",
    "    return gene_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "grouped = association_df.groupby(['doid_code', 'locus'], as_index=False)\n",
    "gene_df = grouped.apply(associate_by_gene)\n",
    "for column in 'locus', 'high_confidence', 'primary':\n",
    "    gene_df[column] = gene_df[column].astype(int)\n",
    "status = gene_df.apply(lambda x: '{}C-{}'.format('H' if x.high_confidence else 'L', 'P' if x.primary else 'S'), axis=1)\n",
    "gene_df.insert(5, 'status', status)\n",
    "gene_df = gene_df.merge(symbol_df, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'274 associations removed that were duplicated across statuses'"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove duplicated associations \n",
    "gene_df = gene_df.sort(['doid_code', 'high_confidence', 'primary'], ascending=False)\n",
    "associations_with_dups = len(gene_df)\n",
    "gene_df = gene_df.drop_duplicates(['doid_code', 'gene'])\n",
    "'{} associations removed that were duplicated across statuses'.format(associations_with_dups - len(gene_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid_code</th>\n",
       "      <th>doid_name</th>\n",
       "      <th>locus</th>\n",
       "      <th>high_confidence</th>\n",
       "      <th>primary</th>\n",
       "      <th>status</th>\n",
       "      <th>gene</th>\n",
       "      <th>symbol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6097</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>3953</td>\n",
       "      <td>LEPR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6114</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>14</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>4094</td>\n",
       "      <td>MAF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6116</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>3667</td>\n",
       "      <td>IRS1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6123</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>20</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>7021</td>\n",
       "      <td>TFAP2B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6124</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>21</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>79068</td>\n",
       "      <td>FTO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      doid_code doid_name  locus  high_confidence  primary status   gene  \\\n",
       "6097  DOID:9970   obesity      0                1        1   HC-P   3953   \n",
       "6114  DOID:9970   obesity     14                1        1   HC-P   4094   \n",
       "6116  DOID:9970   obesity     15                1        1   HC-P   3667   \n",
       "6123  DOID:9970   obesity     20                1        1   HC-P   7021   \n",
       "6124  DOID:9970   obesity     21                1        1   HC-P  79068   \n",
       "\n",
       "      symbol  \n",
       "6097    LEPR  \n",
       "6114     MAF  \n",
       "6116    IRS1  \n",
       "6123  TFAP2B  \n",
       "6124     FTO  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# save gene_df as a tsv\n",
    "gene_df.to_csv('data/gene-associations.tsv', sep='\\t', index=False)\n",
    "gene_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "statuses = ['HC-P', 'HC-S', 'LC-P', 'LC-S']\n",
    "\n",
    "def summarize_gene_df(df):\n",
    "    counter = collections.Counter(df.status)\n",
    "    series = pandas.Series()\n",
    "    for status in statuses:\n",
    "        series[status] = counter[status]\n",
    "    return series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid_code</th>\n",
       "      <th>doid_name</th>\n",
       "      <th>HC-P</th>\n",
       "      <th>HC-S</th>\n",
       "      <th>LC-P</th>\n",
       "      <th>LC-S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>DOID:8778</td>\n",
       "      <td>Crohn's disease</td>\n",
       "      <td>86</td>\n",
       "      <td>131</td>\n",
       "      <td>10</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>DOID:5419</td>\n",
       "      <td>schizophrenia</td>\n",
       "      <td>79</td>\n",
       "      <td>236</td>\n",
       "      <td>76</td>\n",
       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78</th>\n",
       "      <td>DOID:7148</td>\n",
       "      <td>rheumatoid arthritis</td>\n",
       "      <td>79</td>\n",
       "      <td>43</td>\n",
       "      <td>48</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>DOID:9352</td>\n",
       "      <td>type 2 diabetes mellitus</td>\n",
       "      <td>67</td>\n",
       "      <td>31</td>\n",
       "      <td>52</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>DOID:1612</td>\n",
       "      <td>breast cancer</td>\n",
       "      <td>58</td>\n",
       "      <td>68</td>\n",
       "      <td>42</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    doid_code                 doid_name  HC-P  HC-S  LC-P  LC-S\n",
       "84  DOID:8778           Crohn's disease    86   131    10    13\n",
       "72  DOID:5419             schizophrenia    79   236    76    57\n",
       "78  DOID:7148      rheumatoid arthritis    79    43    48    30\n",
       "92  DOID:9352  type 2 diabetes mellitus    67    31    52    29\n",
       "43  DOID:1612             breast cancer    58    68    42    30"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the number of associations per disease\n",
    "disease_summary_df = gene_df.groupby(['doid_code', 'doid_name']).apply(summarize_gene_df)\n",
    "disease_summary_df = disease_summary_df.reset_index()\n",
    "disease_summary_df = disease_summary_df.sort(statuses, ascending=False)\n",
    "disease_summary_df.to_csv('data/diseases.tsv', sep='\\t', index=False)\n",
    "disease_summary_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene</th>\n",
       "      <th>symbol</th>\n",
       "      <th>HC-P</th>\n",
       "      <th>HC-S</th>\n",
       "      <th>LC-P</th>\n",
       "      <th>LC-S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1971</th>\n",
       "      <td>4609</td>\n",
       "      <td>MYC</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3051</th>\n",
       "      <td>7015</td>\n",
       "      <td>TERT</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2564</th>\n",
       "      <td>57178</td>\n",
       "      <td>ZMIZ1</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1574</th>\n",
       "      <td>3394</td>\n",
       "      <td>IRF8</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1651</th>\n",
       "      <td>3559</td>\n",
       "      <td>IL2RA</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       gene symbol  HC-P  HC-S  LC-P  LC-S\n",
       "1971   4609    MYC     7     0     0     0\n",
       "3051   7015   TERT     6     1     0     0\n",
       "2564  57178  ZMIZ1     6     0     5     0\n",
       "1574   3394   IRF8     6     0     1     0\n",
       "1651   3559  IL2RA     6     0     0     2"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the number of associations per gene\n",
    "gene_summary_df = gene_df.groupby(['gene', 'symbol']).apply(summarize_gene_df)\n",
    "gene_summary_df = gene_summary_df.reset_index()\n",
    "gene_summary_df = gene_summary_df.sort(statuses, ascending=False)\n",
    "gene_summary_df.to_csv('data/genes.tsv', sep='\\t', index=False)\n",
    "gene_summary_df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}