{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract disease-gene associations from the GWAS Catalog\n",
    "\n",
    "Method based on our [previousely reported protocal](https://dx.doi.org/10.1101/011569), but that replaces HapMap linkage data from DAPPLE with 1000 Genomes Pilot data from SNAP:\n",
    "\n",
    "1. Start with the GWAS Catalog produced by the EBI, which included EFO mappings for the disease/trait\n",
    "2. Convert EFO terms into disease ontology terms that are in DO Slim\n",
    "3. Identify SNPs in LD with lead SNPs using a manual SNAP query\n",
    "4. Identify a window for each lead SNP based on the furthest upstream and downstream SNPs with $r^2 \\geq 0.5$.\n",
    "5. For each disease, overlap windows, in order of significance, into loci.\n",
    "6. Classify a loci as high confidence if the sample size was at least 1000 and $p \\leq 5 \\times 10^{-8}$.\n",
    "7. For each loci, identify a primary gene and classify the rest as secondary. Only protein-coding genes are considered."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import json\n",
    "import math\n",
    "import collections\n",
    "\n",
    "import requests\n",
    "import numpy\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load GWAS Catalog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read EBI's GWAS Catalog with ontology annotations\n",
    "# https://www.ebi.ac.uk/gwas/docs/fileheaders\n",
    "path = 'download/gwas_catalog_v1.0.1-downloaded_2015-06-08.tsv.gz'\n",
    "dtype_dict = {'UPSTREAM_GENE_ID': numpy.character,\n",
    "              'DOWNSTREAM_GENE_ID': numpy.character,\n",
    "              'SNP_GENE_IDS': numpy.character}\n",
    "ebi_df = pandas.read_table(path, compression='gzip', low_memory=False, dtype=dtype_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>DATE ADDED TO CATALOG</th>\n",
       "      <th>PUBMEDID</th>\n",
       "      <th>FIRST AUTHOR</th>\n",
       "      <th>DATE</th>\n",
       "      <th>JOURNAL</th>\n",
       "      <th>LINK</th>\n",
       "      <th>STUDY</th>\n",
       "      <th>DISEASE/TRAIT</th>\n",
       "      <th>INITIAL SAMPLE DESCRIPTION</th>\n",
       "      <th>REPLICATION SAMPLE DESCRIPTION</th>\n",
       "      <th>...</th>\n",
       "      <th>RISK ALLELE FREQUENCY</th>\n",
       "      <th>P-VALUE</th>\n",
       "      <th>PVALUE_MLOG</th>\n",
       "      <th>P-VALUE (TEXT)</th>\n",
       "      <th>OR or BETA</th>\n",
       "      <th>95% CI (TEXT)</th>\n",
       "      <th>PLATFORM [SNPS PASSING QC]</th>\n",
       "      <th>CNV</th>\n",
       "      <th>MAPPED_TRAIT</th>\n",
       "      <th>MAPPED_TRAIT_URI</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.47</td>\n",
       "      <td>2.000000e-12</td>\n",
       "      <td>11.698970004336019</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.04</td>\n",
       "      <td>[0.03-0.05] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.29</td>\n",
       "      <td>7.000000e-20</td>\n",
       "      <td>19.154901959985743</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.05</td>\n",
       "      <td>[0.038-0.062] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.29</td>\n",
       "      <td>7.000000e-20</td>\n",
       "      <td>19.154901959985743</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.05</td>\n",
       "      <td>[0.038-0.062] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.46</td>\n",
       "      <td>3.000000e-08</td>\n",
       "      <td>7.522878745280337</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.03</td>\n",
       "      <td>[0.02-0.04] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25231870</td>\n",
       "      <td>Perry JR</td>\n",
       "      <td>23-Jul-2014</td>\n",
       "      <td>Nature</td>\n",
       "      <td>http://europepmc.org/abstract/MED/25231870</td>\n",
       "      <td>Parent-of-origin-specific allelic associations...</td>\n",
       "      <td>Menarche (age at onset)</td>\n",
       "      <td>Up to 182,413 European ancestry women</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.46</td>\n",
       "      <td>3.000000e-08</td>\n",
       "      <td>7.522878745280337</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.03</td>\n",
       "      <td>[0.02-0.04] (unit increase)</td>\n",
       "      <td>Illumina &amp; Affymetrix [2,441,815] (imputed)</td>\n",
       "      <td>N</td>\n",
       "      <td>age at menarche</td>\n",
       "      <td>http://www.ebi.ac.uk/efo/EFO_0004703</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  DATE ADDED TO CATALOG  PUBMEDID FIRST AUTHOR         DATE JOURNAL  \\\n",
       "0                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "1                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "2                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "3                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "4                   NaN  25231870     Perry JR  23-Jul-2014  Nature   \n",
       "\n",
       "                                         LINK  \\\n",
       "0  http://europepmc.org/abstract/MED/25231870   \n",
       "1  http://europepmc.org/abstract/MED/25231870   \n",
       "2  http://europepmc.org/abstract/MED/25231870   \n",
       "3  http://europepmc.org/abstract/MED/25231870   \n",
       "4  http://europepmc.org/abstract/MED/25231870   \n",
       "\n",
       "                                               STUDY            DISEASE/TRAIT  \\\n",
       "0  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "1  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "2  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "3  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "4  Parent-of-origin-specific allelic associations...  Menarche (age at onset)   \n",
       "\n",
       "              INITIAL SAMPLE DESCRIPTION REPLICATION SAMPLE DESCRIPTION  \\\n",
       "0  Up to 182,413 European ancestry women                            NaN   \n",
       "1  Up to 182,413 European ancestry women                            NaN   \n",
       "2  Up to 182,413 European ancestry women                            NaN   \n",
       "3  Up to 182,413 European ancestry women                            NaN   \n",
       "4  Up to 182,413 European ancestry women                            NaN   \n",
       "\n",
       "                   ...                  RISK ALLELE FREQUENCY       P-VALUE  \\\n",
       "0                  ...                                   0.47  2.000000e-12   \n",
       "1                  ...                                   0.29  7.000000e-20   \n",
       "2                  ...                                   0.29  7.000000e-20   \n",
       "3                  ...                                   0.46  3.000000e-08   \n",
       "4                  ...                                   0.46  3.000000e-08   \n",
       "\n",
       "          PVALUE_MLOG P-VALUE (TEXT) OR or BETA  \\\n",
       "0  11.698970004336019            NaN       0.04   \n",
       "1  19.154901959985743            NaN       0.05   \n",
       "2  19.154901959985743            NaN       0.05   \n",
       "3   7.522878745280337            NaN       0.03   \n",
       "4   7.522878745280337            NaN       0.03   \n",
       "\n",
       "                   95% CI (TEXT)                   PLATFORM [SNPS PASSING QC]  \\\n",
       "0    [0.03-0.05] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "1  [0.038-0.062] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "2  [0.038-0.062] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "3    [0.02-0.04] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "4    [0.02-0.04] (unit increase)  Illumina & Affymetrix [2,441,815] (imputed)   \n",
       "\n",
       "  CNV     MAPPED_TRAIT                      MAPPED_TRAIT_URI  \n",
       "0   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "1   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "2   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "3   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "4   N  age at menarche  http://www.ebi.ac.uk/efo/EFO_0004703  \n",
       "\n",
       "[5 rows x 36 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ebi_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Entrez Gene information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read entrez symbol to gene mapping\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/symbols-human.json'\n",
    "symbol_to_id = requests.get(url).json()\n",
    "\n",
    "# Read entrez synonym to genes mapping\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/synonyms-human.json'\n",
    "synonym_to_ids = requests.get(url).json()\n",
    "\n",
    "# Read entrez genes\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/genes-human.tsv'\n",
    "entrez_df = pandas.read_table(url)\n",
    "\n",
    "# Read and create hgnc to entrez gene mapping\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/xrefs-human.tsv'\n",
    "hgnc_df = pandas.read_table(url)\n",
    "hgnc_df = hgnc_df[hgnc_df.resource == 'HGNC'][['GeneID', 'identifier']]\n",
    "hgnc_df = hgnc_df.rename(columns={'GeneID': 'gene', 'identifier': 'hgnc_id'})\n",
    "hgnc_df.head()\n",
    "\n",
    "# Create a set of coding genes\n",
    "coding_genes = set(entrez_df.GeneID[entrez_df.type_of_gene == 'protein-coding'].astype(str))\n",
    "len(coding_genes)\n",
    "\n",
    "# Create a symbol dataframe\n",
    "symbol_df = entrez_df[['GeneID', 'Symbol']].rename(columns={'GeneID': 'gene', 'Symbol': 'symbol'})\n",
    "symbol_df = symbol_df.merge(hgnc_df, how='left')\n",
    "symbol_df.gene = symbol_df.gene.astype(str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert from EFO to Disease Ontology"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create a uri_df (for cross-references)\n",
    "rows = list()\n",
    "for uri in filter(pandas.notnull, set(ebi_df['MAPPED_TRAIT_URI'])):\n",
    "    head, tail = uri.rsplit('/', 1)\n",
    "    resource, resource_id = tail.split('_', 1)\n",
    "    rows.append([uri, resource, resource_id])\n",
    "    \n",
    "uri_df = pandas.DataFrame(rows, columns=['MAPPED_TRAIT_URI', 'resource', 'resource_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read DO unpropagated cross-references\n",
    "url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs.tsv'\n",
    "doxref_df = pandas.read_table(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11194"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inner join the GWAS catalog with the DO slim mapping \n",
    "map_df = uri_df.merge(doxref_df)\n",
    "map_df = map_df[['MAPPED_TRAIT_URI', 'doid_code', 'doid_name']]\n",
    "ebi_df = ebi_df.merge(map_df)\n",
    "len(ebi_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Identify lead SNPs and proxy search using SNAP\n",
    "\n",
    "The SNAP proxy search must be run manually. Windows are computed in a companion notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5616"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Write all lead SNPs for SNAP input\n",
    "gwas_snps = set('rs{}'.format(x) for x in ebi_df.SNP_ID_CURRENT if pandas.notnull(x))\n",
    "gwas_snps = sorted(gwas_snps)\n",
    "with open('data/snap/do-lead-SNPs.txt', 'w') as write_file:\n",
    "    write_file.write('\\n'.join(gwas_snps))\n",
    "len(gwas_snps)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process associations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "HC_mlogp = -math.log10(5e-8)\n",
    "HC_samples = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def extract_sample_size(text):\n",
    "    pattern_cases = r'([\\d,]+)[^\\d]*?cases'\n",
    "    pattern_contols = r'([\\d,]+)[^\\d]*?controls'\n",
    "    pattern_samples = r'[\\d,]+'\n",
    "    \n",
    "    remove_commas = lambda x: int(x.replace(',', ''))\n",
    "    \n",
    "    sample_size = {'cases': None, 'controls': None, 'samples': None,}\n",
    "    \n",
    "    for key, pattern in ('cases', pattern_cases), ('controls', pattern_contols):\n",
    "        try:\n",
    "            match = re.search(pattern, text)\n",
    "            sample_size[key] = remove_commas(match.group(1))\n",
    "        except (IndexError, AttributeError, ValueError):\n",
    "            continue\n",
    "    \n",
    "    if sample_size['cases'] is not None and sample_size['controls'] is not None:\n",
    "        sample_size['samples'] = sample_size['cases'] + sample_size['controls']\n",
    "    \n",
    "    if sample_size['cases'] is None and sample_size['controls'] is None:\n",
    "        try:\n",
    "            match = re.search(pattern_samples, text)\n",
    "            sample_size['samples'] = remove_commas(match.group(0))\n",
    "        except (IndexError, AttributeError, ValueError):\n",
    "            pass\n",
    "    \n",
    "    return sample_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def process_reported_symbols(text):\n",
    "    if pandas.isnull(text):\n",
    "        return set()\n",
    "    symbols = set(re.split(r'[, ]+', text))\n",
    "    symbols -= {'Intergenic', 'NR', 'Pending'}\n",
    "    entrez_ids = set()\n",
    "    for symbol in symbols:\n",
    "        gene_id = symbol_to_id.get(symbol)\n",
    "        if not gene_id:\n",
    "            gene_ids = synonym_to_ids.get(symbol, [])\n",
    "            if len(gene_ids) != 1:\n",
    "                continue\n",
    "            gene_id, = gene_ids\n",
    "        entrez_ids.add(str(gene_id))\n",
    "    return entrez_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid_code</th>\n",
       "      <th>doid_name</th>\n",
       "      <th>lead_snp</th>\n",
       "      <th>reported_symbols</th>\n",
       "      <th>mlog_pvalue</th>\n",
       "      <th>date</th>\n",
       "      <th>pubmed_id</th>\n",
       "      <th>upstream_gene</th>\n",
       "      <th>downstream_gene</th>\n",
       "      <th>containing_genes</th>\n",
       "      <th>reported_genes</th>\n",
       "      <th>cases</th>\n",
       "      <th>controls</th>\n",
       "      <th>samples</th>\n",
       "      <th>high_confidence</th>\n",
       "      <th>lead_chrom</th>\n",
       "      <th>lower_coord</th>\n",
       "      <th>upper_coord</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DOID:1067</td>\n",
       "      <td>open-angle glaucoma</td>\n",
       "      <td>rs2487032</td>\n",
       "      <td>ABCA1</td>\n",
       "      <td>13.154902</td>\n",
       "      <td>2014-08-31</td>\n",
       "      <td>25173107</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([])</td>\n",
       "      <td>set([19])</td>\n",
       "      <td>966</td>\n",
       "      <td>1005</td>\n",
       "      <td>1971</td>\n",
       "      <td>1</td>\n",
       "      <td>chr9</td>\n",
       "      <td>106731644</td>\n",
       "      <td>106747174</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DOID:1067</td>\n",
       "      <td>open-angle glaucoma</td>\n",
       "      <td>rs3785176</td>\n",
       "      <td>PMM2</td>\n",
       "      <td>6.698970</td>\n",
       "      <td>2014-08-31</td>\n",
       "      <td>25173107</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([])</td>\n",
       "      <td>set([5373])</td>\n",
       "      <td>966</td>\n",
       "      <td>1005</td>\n",
       "      <td>1971</td>\n",
       "      <td>0</td>\n",
       "      <td>chr16</td>\n",
       "      <td>8804432</td>\n",
       "      <td>8854102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DOID:1067</td>\n",
       "      <td>open-angle glaucoma</td>\n",
       "      <td>rs4977756</td>\n",
       "      <td>CDKN2B-AS1</td>\n",
       "      <td>29.154902</td>\n",
       "      <td>2014-08-31</td>\n",
       "      <td>25173105</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>1155</td>\n",
       "      <td>1922</td>\n",
       "      <td>3077</td>\n",
       "      <td>1</td>\n",
       "      <td>chr9</td>\n",
       "      <td>21993367</td>\n",
       "      <td>22111349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DOID:1067</td>\n",
       "      <td>open-angle glaucoma</td>\n",
       "      <td>rs4977756</td>\n",
       "      <td>CDKN2B-AS1</td>\n",
       "      <td>29.154902</td>\n",
       "      <td>2014-08-31</td>\n",
       "      <td>25173105</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([102724137])</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>1155</td>\n",
       "      <td>1922</td>\n",
       "      <td>3077</td>\n",
       "      <td>1</td>\n",
       "      <td>chr9</td>\n",
       "      <td>21993367</td>\n",
       "      <td>22111349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>DOID:1686</td>\n",
       "      <td>glaucoma</td>\n",
       "      <td>rs4977756</td>\n",
       "      <td>CDKN2B-AS1</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>2011-05-01</td>\n",
       "      <td>21532571</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>set([100048912])</td>\n",
       "      <td>590</td>\n",
       "      <td>3956</td>\n",
       "      <td>4546</td>\n",
       "      <td>1</td>\n",
       "      <td>chr9</td>\n",
       "      <td>21993367</td>\n",
       "      <td>22111349</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   doid_code            doid_name   lead_snp reported_symbols  mlog_pvalue  \\\n",
       "0  DOID:1067  open-angle glaucoma  rs2487032            ABCA1    13.154902   \n",
       "1  DOID:1067  open-angle glaucoma  rs3785176             PMM2     6.698970   \n",
       "2  DOID:1067  open-angle glaucoma  rs4977756       CDKN2B-AS1    29.154902   \n",
       "3  DOID:1067  open-angle glaucoma  rs4977756       CDKN2B-AS1    29.154902   \n",
       "4  DOID:1686             glaucoma  rs4977756       CDKN2B-AS1    14.000000   \n",
       "\n",
       "        date  pubmed_id upstream_gene downstream_gene  containing_genes  \\\n",
       "0 2014-08-31   25173107           NaN             NaN           set([])   \n",
       "1 2014-08-31   25173107           NaN             NaN           set([])   \n",
       "2 2014-08-31   25173105           NaN             NaN  set([100048912])   \n",
       "3 2014-08-31   25173105           NaN             NaN  set([102724137])   \n",
       "4 2011-05-01   21532571           NaN             NaN  set([100048912])   \n",
       "\n",
       "     reported_genes  cases  controls  samples  high_confidence lead_chrom  \\\n",
       "0         set([19])    966      1005     1971                1       chr9   \n",
       "1       set([5373])    966      1005     1971                0      chr16   \n",
       "2  set([100048912])   1155      1922     3077                1       chr9   \n",
       "3  set([100048912])   1155      1922     3077                1       chr9   \n",
       "4  set([100048912])    590      3956     4546                1       chr9   \n",
       "\n",
       "   lower_coord  upper_coord  \n",
       "0    106731644    106747174  \n",
       "1      8804432      8854102  \n",
       "2     21993367     22111349  \n",
       "3     21993367     22111349  \n",
       "4     21993367     22111349  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rename and order columns\n",
    "rename_dict = {'SNP_ID_CURRENT': 'lead_snp', 'REPORTED GENE(S)': 'symbols', 'PUBMEDID': 'pubmed_id',\n",
    "              'INITIAL SAMPLE DESCRIPTION': 'sample_description', 'PVALUE_MLOG': 'mlog_pvalue',\n",
    "              'DATE': 'date', 'REPORTED GENE(S)': 'reported_symbols', 'UPSTREAM_GENE_ID': 'upstream_gene',\n",
    "              'DOWNSTREAM_GENE_ID': 'downstream_gene', 'SNP_GENE_IDS': 'containing_genes'}\n",
    "columns = ['doid_code', 'doid_name'] + list(rename_dict.values())\n",
    "association_df = ebi_df.rename(columns = rename_dict)[columns]\n",
    "\n",
    "# split containing genes\n",
    "association_df.containing_genes = association_df.containing_genes.map(\n",
    "    lambda x: set(x.split(';')) if pandas.notnull(x) else set())\n",
    "\n",
    "# convert reported symbols to entrez GeneIDs\n",
    "association_df['reported_genes'] = association_df.reported_symbols.map(process_reported_symbols)\n",
    "\n",
    "# convert dates\n",
    "association_df.date = pandas.to_datetime(association_df.date)\n",
    "\n",
    "# Add rs to SNP names\n",
    "association_df.lead_snp = 'rs' + association_df.lead_snp\n",
    "\n",
    "# extract sample size information\n",
    "sample_size_df = pandas.DataFrame(list(map(extract_sample_size, association_df.sample_description)))\n",
    "association_df = association_df.drop(['sample_description'], axis=1)\n",
    "association_df = pandas.concat([association_df, sample_size_df], axis=1)\n",
    "\n",
    "# convert mlog_pavlue to numeric\n",
    "association_df.mlog_pvalue = association_df.mlog_pvalue.convert_objects(convert_numeric=True)\n",
    "association_df['high_confidence'] = (\n",
    "    (association_df.mlog_pvalue >= HC_mlogp) &\n",
    "    (association_df.samples >= HC_samples)).astype(int)\n",
    "\n",
    "# Add window information\n",
    "window_df = pandas.read_table('data/snap/windows.tsv')\n",
    "association_df = association_df.merge(window_df[['lead_snp', 'lead_chrom', 'lower_coord', 'upper_coord']])\n",
    "\n",
    "association_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Identify disease-specific loci"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def identify_loci(disease_df):\n",
    "    disease_df = disease_df.sort(['high_confidence', 'mlog_pvalue', 'date'], ascending=False)\n",
    "    \n",
    "    loci = dict()\n",
    "    \n",
    "    for i, row in disease_df.iterrows():\n",
    "        interval = row.lead_chrom, row.lower_coord, row.upper_coord\n",
    "        for chrom, lower, upper in loci:\n",
    "            if interval[0] == chrom and (\n",
    "                (lower <= interval[1] and interval[1] <= upper) or\n",
    "                (lower <= interval[2] and interval[2] <= upper)):\n",
    "                loci[(chrom, lower, upper)].append(row)\n",
    "                break\n",
    "        else:\n",
    "            loci[interval] = [row]\n",
    "    \n",
    "    # convert each locus to a dataframe\n",
    "    loci = list(map(pandas.DataFrame, loci.values()))\n",
    "\n",
    "    for i, locus_df in enumerate(loci):\n",
    "        locus_df['locus'] = i\n",
    "    \n",
    "    return pandas.concat(loci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "association_df = association_df.groupby('doid_code', as_index=False).apply(identify_loci)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save associations to tsv\n",
    "association_write_df = association_df.copy()\n",
    "for column in ['reported_genes', 'containing_genes']:\n",
    "    association_write_df[column] = association_write_df[column].map(lambda x: '|'.join(x))\n",
    "association_write_df.to_csv('data/snp-associations.tsv', sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Annotate genes to loci "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def max_counter_keys(counter, key_subset=None):\n",
    "    if key_subset is not None:\n",
    "        counter = collections.Counter({k: v for\n",
    "            k, v in counter.items() if k in key_subset})\n",
    "    max_value = max(list(counter.values()) + [0])\n",
    "    max_keys = [k for k, v in counter.items() if v == max_value]\n",
    "    return max_keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def associate_by_gene(locus_df):\n",
    "        \n",
    "    # sort associations by precedence\n",
    "    locus_df = locus_df.sort(['high_confidence', 'mlog_pvalue', 'date'], ascending=False)\n",
    "    \n",
    "    # studies that report the locus\n",
    "    studies = list(locus_df.pubmed_id.drop_duplicates())\n",
    "    \n",
    "    # Count the number of times each gene is reported across all studies in a loci\n",
    "    reported_counts = collections.Counter()\n",
    "    for genes in locus_df.reported_genes:\n",
    "        reported_counts.update(genes)\n",
    "    \n",
    "    primary_gene = None\n",
    "    \n",
    "    # Identify a single mode reported gene\n",
    "    mode_reported = max_counter_keys(reported_counts, coding_genes)\n",
    "    if len(mode_reported) == 1:\n",
    "        primary_gene, = mode_reported\n",
    "    \n",
    "    # Iterate through associations attempting to resolve primary-gene ambiguity\n",
    "    for i, row in locus_df.iterrows():\n",
    "        if primary_gene:\n",
    "            break\n",
    "\n",
    "        # Find containing genes\n",
    "        containing_genes = row.containing_genes & coding_genes\n",
    "        \n",
    "        # If a lead SNP is in a gene, consider the containing gene primary\n",
    "        if len(containing_genes) == 1:\n",
    "            primary_gene, = containing_genes\n",
    "            break\n",
    "        \n",
    "        # If the lead SNP is in multiple genes, take the mode reported of those genes\n",
    "        mode_reported = max_counter_keys(reported_counts, containing_genes)\n",
    "        if len(mode_reported) == 1:\n",
    "            primary_gene, = mode_reported\n",
    "            break\n",
    "\n",
    "        # Take the more reported gene from the upstream and downstream gene\n",
    "        stream_genes = {gene for gene in [row.downstream_gene, row.upstream_gene] if pandas.notnull(gene)}\n",
    "        mode_reported = max_counter_keys(reported_counts, coding_genes & stream_genes)\n",
    "        if len(mode_reported) == 1:\n",
    "            primary_gene, = mode_reported\n",
    "            break\n",
    "\n",
    "    # All genes except the primary are considered secondary\n",
    "    secondary_genes = set()\n",
    "    for column in ['reported_genes', 'containing_genes']:\n",
    "        for genes in locus_df[column]:\n",
    "            secondary_genes |= genes\n",
    "    secondary_genes |= set(locus_df.upstream_gene)\n",
    "    secondary_genes |= set(locus_df.downstream_gene)\n",
    "    secondary_genes = set(filter(pandas.notnull, secondary_genes))\n",
    "    secondary_genes.discard(primary_gene)\n",
    "    secondary_genes &= coding_genes\n",
    "    \n",
    "    # Create a dataframe to return\n",
    "    columns = ['doid_code', 'doid_name', 'locus', 'high_confidence', 'pubmed_ids', 'primary', 'gene']\n",
    "    rows = list()\n",
    "    first_row = locus_df.iloc[0]\n",
    "    base_row = first_row.doid_code, first_row.doid_name, first_row.locus, first_row.high_confidence, studies\n",
    "    if primary_gene:\n",
    "        rows.append(base_row + (1, primary_gene))\n",
    "    for secondary_gene in secondary_genes:\n",
    "        rows.append(base_row + (0, secondary_gene))\n",
    "    gene_df = pandas.DataFrame(rows, columns=columns)\n",
    "    \n",
    "    return gene_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "grouped = association_df.groupby(['doid_code', 'locus'], as_index=False)\n",
    "gene_df = grouped.apply(associate_by_gene)\n",
    "for column in 'locus', 'high_confidence', 'primary':\n",
    "    gene_df[column] = gene_df[column].astype(int)\n",
    "status = gene_df.apply(lambda x: '{}C-{}'.format('H' if x.high_confidence else 'L', 'P' if x.primary else 'S'), axis=1)\n",
    "gene_df.insert(5, 'status', status)\n",
    "gene_df = gene_df.merge(symbol_df, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'284 associations removed that were duplicated across statuses'"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove duplicated associations \n",
    "associations_with_dups = len(gene_df)\n",
    "\n",
    "def remove_duplicated_gene_associations(df):\n",
    "    df = df.sort(['high_confidence', 'primary'], ascending=False)\n",
    "    series = df.iloc[0]\n",
    "    studies = list()\n",
    "    for pubmed_ids in df.pubmed_ids:\n",
    "        studies.extend(pubmed_ids)\n",
    "    series['pubmed_ids'] = '|'.join(map(str, pandas.Series(studies).drop_duplicates()))\n",
    "    return series\n",
    "\n",
    "gene_df = gene_df.groupby(['doid_code', 'gene']).apply(remove_duplicated_gene_associations)\n",
    "gene_df = gene_df.reset_index(drop=True)\n",
    "gene_df = gene_df.sort(['doid_code', 'high_confidence', 'primary'], ascending=False)\n",
    "'{} associations removed that were duplicated across statuses'.format(associations_with_dups - len(gene_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid_code</th>\n",
       "      <th>doid_name</th>\n",
       "      <th>locus</th>\n",
       "      <th>high_confidence</th>\n",
       "      <th>pubmed_ids</th>\n",
       "      <th>status</th>\n",
       "      <th>primary</th>\n",
       "      <th>gene</th>\n",
       "      <th>symbol</th>\n",
       "      <th>hgnc_id</th>\n",
       "      <th>hgnc_id_int</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6485</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>22484627</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>1</td>\n",
       "      <td>10562</td>\n",
       "      <td>OLFM4</td>\n",
       "      <td>HGNC:17190</td>\n",
       "      <td>17190</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6488</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>51</td>\n",
       "      <td>1</td>\n",
       "      <td>19557161</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>1</td>\n",
       "      <td>127018</td>\n",
       "      <td>LYPLAL1</td>\n",
       "      <td>HGNC:20440</td>\n",
       "      <td>20440</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6489</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>54</td>\n",
       "      <td>1</td>\n",
       "      <td>23563609|20421936</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>1</td>\n",
       "      <td>129787</td>\n",
       "      <td>TMEM18</td>\n",
       "      <td>HGNC:25257</td>\n",
       "      <td>25257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6500</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>52</td>\n",
       "      <td>1</td>\n",
       "      <td>23563609</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>1</td>\n",
       "      <td>257194</td>\n",
       "      <td>NEGR1</td>\n",
       "      <td>HGNC:17302</td>\n",
       "      <td>17302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6505</th>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>obesity</td>\n",
       "      <td>59</td>\n",
       "      <td>1</td>\n",
       "      <td>22484627</td>\n",
       "      <td>HC-P</td>\n",
       "      <td>1</td>\n",
       "      <td>3215</td>\n",
       "      <td>HOXB5</td>\n",
       "      <td>HGNC:5116</td>\n",
       "      <td>5116</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      doid_code doid_name  locus  high_confidence         pubmed_ids status  \\\n",
       "6485  DOID:9970   obesity      2                1           22484627   HC-P   \n",
       "6488  DOID:9970   obesity     51                1           19557161   HC-P   \n",
       "6489  DOID:9970   obesity     54                1  23563609|20421936   HC-P   \n",
       "6500  DOID:9970   obesity     52                1           23563609   HC-P   \n",
       "6505  DOID:9970   obesity     59                1           22484627   HC-P   \n",
       "\n",
       "      primary    gene   symbol     hgnc_id hgnc_id_int  \n",
       "6485        1   10562    OLFM4  HGNC:17190       17190  \n",
       "6488        1  127018  LYPLAL1  HGNC:20440       20440  \n",
       "6489        1  129787   TMEM18  HGNC:25257       25257  \n",
       "6500        1  257194    NEGR1  HGNC:17302       17302  \n",
       "6505        1    3215    HOXB5   HGNC:5116        5116  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# save gene_df as a tsv\n",
    "gene_df['hgnc_id_int'] = gene_df.hgnc_id.map(lambda x: x if pandas.isnull(x) else x.split(':')[1])\n",
    "gene_df.to_csv('data/gene-associations.tsv', sep='\\t', index=False)\n",
    "gene_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "statuses = ['HC-P', 'HC-S', 'LC-P', 'LC-S']\n",
    "\n",
    "def summarize_gene_df(df):\n",
    "    counter = collections.Counter(df.status)\n",
    "    series = pandas.Series()\n",
    "    for status in statuses:\n",
    "        series[status] = counter[status]\n",
    "    return series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doid_code</th>\n",
       "      <th>doid_name</th>\n",
       "      <th>HC-P</th>\n",
       "      <th>HC-S</th>\n",
       "      <th>LC-P</th>\n",
       "      <th>LC-S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>DOID:8778</td>\n",
       "      <td>Crohn's disease</td>\n",
       "      <td>86</td>\n",
       "      <td>131</td>\n",
       "      <td>10</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DOID:0050589</td>\n",
       "      <td>inflammatory bowel disease</td>\n",
       "      <td>84</td>\n",
       "      <td>194</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>88</th>\n",
       "      <td>DOID:5419</td>\n",
       "      <td>schizophrenia</td>\n",
       "      <td>79</td>\n",
       "      <td>236</td>\n",
       "      <td>76</td>\n",
       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>DOID:7148</td>\n",
       "      <td>rheumatoid arthritis</td>\n",
       "      <td>79</td>\n",
       "      <td>43</td>\n",
       "      <td>48</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>DOID:9352</td>\n",
       "      <td>type 2 diabetes mellitus</td>\n",
       "      <td>67</td>\n",
       "      <td>31</td>\n",
       "      <td>52</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        doid_code                   doid_name  HC-P  HC-S  LC-P  LC-S\n",
       "109     DOID:8778             Crohn's disease    86   131    10    13\n",
       "2    DOID:0050589  inflammatory bowel disease    84   194     0     0\n",
       "88      DOID:5419               schizophrenia    79   236    76    57\n",
       "100     DOID:7148        rheumatoid arthritis    79    43    48    30\n",
       "117     DOID:9352    type 2 diabetes mellitus    67    31    52    29"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the number of associations per disease\n",
    "disease_summary_df = gene_df.groupby(['doid_code', 'doid_name']).apply(summarize_gene_df)\n",
    "disease_summary_df = disease_summary_df.reset_index()\n",
    "disease_summary_df = disease_summary_df.sort(statuses, ascending=False)\n",
    "disease_summary_df.to_csv('data/diseases.tsv', sep='\\t', index=False)\n",
    "disease_summary_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene</th>\n",
       "      <th>symbol</th>\n",
       "      <th>HC-P</th>\n",
       "      <th>HC-S</th>\n",
       "      <th>LC-P</th>\n",
       "      <th>LC-S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3218</th>\n",
       "      <td>7015</td>\n",
       "      <td>TERT</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>10019</td>\n",
       "      <td>SH2B3</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2794</th>\n",
       "      <td>5771</td>\n",
       "      <td>PTPN2</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3159</th>\n",
       "      <td>6775</td>\n",
       "      <td>STAT4</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2699</th>\n",
       "      <td>57178</td>\n",
       "      <td>ZMIZ1</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       gene symbol  HC-P  HC-S  LC-P  LC-S\n",
       "3218   7015   TERT     8     1     0     0\n",
       "17    10019  SH2B3     8     0     0     0\n",
       "2794   5771  PTPN2     8     0     0     0\n",
       "3159   6775  STAT4     8     0     0     0\n",
       "2699  57178  ZMIZ1     7     0     5     0"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the number of associations per gene\n",
    "gene_summary_df = gene_df.groupby(['gene', 'symbol']).apply(summarize_gene_df)\n",
    "gene_summary_df = gene_summary_df.reset_index()\n",
    "gene_summary_df = gene_summary_df.sort(statuses, ascending=False)\n",
    "gene_summary_df.to_csv('data/genes.tsv', sep='\\t', index=False)\n",
    "gene_summary_df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}