{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Download the UniProt ID mapping (uncomment the wget line on the first run), then print its checksum\n",
    "#! wget --directory-prefix download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz\n",
    "! shasum download/idmapping.dat.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### idmapping.dat [documentation](ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README)\n",
    "\n",
    "1) idmapping.dat\n",
    "This file has three columns, delimited by tab:\n",
    "1. UniProtKB-AC \n",
    "2. ID_type \n",
    "3. ID\n",
    "\n",
    "where ID_type is the database name as appearing in UniProtKB cross-references, \n",
    "and as supported by the ID mapping tool on the UniProt web site, \n",
    "http://www.uniprot.org/mapping and where ID is the identifier in \n",
    "that cross-referenced database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import io\n",
    "import gzip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def generate_idmapping(path):\n",
    "    \"\"\"Yield each row of a gzip-compressed, tab-delimited file.\n",
    "\n",
    "    path -- location of the gzipped file (idmapping.dat.gz in this notebook).\n",
    "\n",
    "    Each row is yielded as a list of strings, one per tab-separated column.\n",
    "    Nothing is opened until the first row is requested, and the file is\n",
    "    closed when the generator is exhausted, closed early, or raises.\n",
    "    \"\"\"\n",
    "    # Text-mode gzip.open replaces the manual TextIOWrapper, and the with-block\n",
    "    # guarantees the handle is released even when iteration stops part-way.\n",
    "    # newline='' lets the csv module do its own line-ending handling.\n",
    "    with gzip.open(path, 'rt', newline='') as text:\n",
    "        yield from csv.reader(text, delimiter='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Relative path of the gzipped UniProt ID mapping inside the download directory.\n",
    "path = os.path.join('download', 'idmapping.dat.gz')\n",
    "# generate_idmapping returns a generator: no rows are read here, and the rows can\n",
    "# be consumed only once. Re-run this cell before iterating over the file again.\n",
    "mapping_generator = generate_idmapping(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Cross-reference databases to keep; rows for any other ID_type are skipped.\n",
    "extract = {'GeneID', 'HGNC'}\n",
    "# One set of (accession, identifier) pairs per database, so repeated rows collapse.\n",
    "mappings = dict((database, set()) for database in extract)\n",
    "\n",
    "for accession, target, target_id in mapping_generator:\n",
    "    if target in mappings:\n",
    "        mappings[target].add((accession, target_id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write one gzipped TSV per extracted database: a header row followed by the\n",
    "# sorted (UniProt accession, identifier) pairs.\n",
    "for target, mapset in mappings.items():\n",
    "    path = os.path.join('data', 'map', '{}.tsv.gz'.format(target))\n",
    "    # Create data/map when it is missing so gzip.open does not fail.\n",
    "    os.makedirs(os.path.dirname(path), exist_ok=True)\n",
    "    # Text-mode gzip in a with-block flushes buffered text before the archive is\n",
    "    # finalised; closing only the underlying binary file discarded whatever the\n",
    "    # text wrapper had not yet written, truncating the output.\n",
    "    with gzip.open(path, 'wt', newline='') as write_file:\n",
    "        writer = csv.writer(write_file, delimiter='\\t')\n",
    "        writer.writerow(['uniprot', target])\n",
    "        writer.writerows(sorted(mapset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}