{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# download uniprot ID mapping\n",
    "#! wget --directory-prefix download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz\n",
    "! shasum download/idmapping.dat.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### idmapping.dat [documentation](ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README)\n",
    "\n",
    "1) idmapping.dat\n",
    "This file has three columns, delimited by tab:\n",
    "1. UniProtKB-AC \n",
    "2. ID_type \n",
    "3. ID\n",
    "\n",
    "where ID_type is the database name as appearing in UniProtKB cross-references, \n",
    "and as supported by the ID mapping tool on the UniProt web site, \n",
    "http://www.uniprot.org/mapping and where ID is the identifier in \n",
    "that cross-referenced database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import io\n",
    "import gzip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def generate_idmapping(path):\n",
    "    \"\"\"\n",
    "    Yield the rows of a gzip-compressed, tab-delimited file.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str\n",
    "        Location of the gzipped file (e.g. idmapping.dat.gz).\n",
    "\n",
    "    Yields\n",
    "    ------\n",
    "    list of str\n",
    "        The tab-separated fields of one line of the file.\n",
    "    \"\"\"\n",
    "    # The with-block closes the file even when the consumer stops early\n",
    "    # or raises, not only when the generator runs to exhaustion.\n",
    "    # newline='' is how the csv module expects its input to be opened.\n",
    "    with gzip.open(path, 'rt', newline='') as text:\n",
    "        yield from csv.reader(text, delimiter='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "path = os.path.join('download', 'idmapping.dat.gz')\n",
    "mapping_generator = generate_idmapping(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "extract = {'GeneID', 'HGNC'}\n",
    "mappings = {target: set() for target in extract}\n",
    "\n",
    "# Collect (accession, target_id) pairs for the requested ID types only.\n",
    "for accession, target, target_id in mapping_generator:\n",
    "    if target not in extract:\n",
    "        continue\n",
    "    mappings[target].add((accession, target_id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# gzip.open cannot create missing directories, so make sure the target exists.\n",
    "map_dir = os.path.join('data', 'map')\n",
    "os.makedirs(map_dir, exist_ok=True)\n",
    "\n",
    "for target, mapset in mappings.items():\n",
    "    # A separate name keeps `path` pointing at the input file.\n",
    "    map_path = os.path.join(map_dir, '{}.tsv.gz'.format(target))\n",
    "    # Text-mode gzip inside a with-block flushes buffered text before the\n",
    "    # file is closed; closing only an underlying binary handle can drop\n",
    "    # whatever a TextIOWrapper around it still holds in its buffer.\n",
    "    with gzip.open(map_path, 'wt', newline='') as write_file:\n",
    "        writer = csv.writer(write_file, delimiter='\\t')\n",
    "        writer.writerow(['uniprot', target])\n",
    "        writer.writerows(sorted(mapset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}