{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parse the BindingDB tsv export\n",
    "\n",
    "This notebook peforms the following processing steps on the [BindingDB](http://www.bindingdb.org/bind/index.jsp) export:\n",
    "\n",
    "+ processes affinities to floats\n",
    "+ converts to entrez genes\n",
    "+ simplifies observation into essential fields\n",
    "\n",
    "See the corresponding [Thinklab discussion](http://doi.org/10.15363/thinklab.d53) for more information."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import gzip\n",
    "import pprint\n",
    "import collections\n",
    "import operator\n",
    "\n",
    "import pandas\n",
    "import requests"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download BindingDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9dae0b2175a1ac22b11733e2b9343a7efec7936e  download/BindingDB_All_2015m10.tsv.gz\r\n"
     ]
    }
   ],
   "source": [
    "# Download all data from BindingDB\n",
    "filename = 'BindingDB_All_2015m10.tsv'\n",
    "# ! wget --directory-prefix download https://www.bindingdb.org/bind/downloads/{filename}.zip\n",
    "# ! unzip -d download download/{filename}.zip\n",
    "# ! rm download/{filename}.zip\n",
    "# ! mv download/BindingDB_All.tsv download/{filename}\n",
    "# ! gzip -f download/{filename}\n",
    "! shasum download/{filename}.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load uniprot to entrez gene mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# uniprot to entrez gene mapping\n",
    "url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz'\n",
    "uniprot_df = pandas.read_table(url, compression='gzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "uniprot_to_entrez = dict()\n",
    "for uniprot, entrez in zip(uniprot_df.uniprot, uniprot_df.GeneID):\n",
    "    uniprot_to_entrez.setdefault(uniprot, set()).add(str(entrez))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read and process BindingDB tsv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "target_fields = [\n",
    "    'BindingDB Target Chain  Sequence',\n",
    "    'PDB ID(s) of Target Chain',\n",
    "    'UniProt (SwissProt) Recommended Name of Target Chain',\n",
    "    'UniProt (SwissProt) Entry Name of Target Chain',\n",
    "    'UniProt (SwissProt) Primary ID of Target Chain',\n",
    "    'UniProt (SwissProt) Secondary ID(s) of Target Chain',\n",
    "    'UniProt (SwissProt) Alternative ID(s) of Target Chain',\n",
    "    'UniProt (TrEMBL) Submitted Name of Target Chain',\n",
    "    'UniProt (TrEMBL) Entry Name of Target Chain',\n",
    "    'UniProt (TrEMBL) Primary ID of Target Chain',\n",
    "    'UniProt (TrEMBL) Secondary ID(s) of Target Chain',\n",
    "    'UniProt (TrEMBL) Alternative ID(s) of Target Chain',\n",
    "]\n",
    "\n",
    "chains_key = 'Number of Protein Chains in Target (>1 implies a multichain complex)'\n",
    "\n",
    "def read_bindingdb(path, verbose=False, max_rows=None):\n",
    "    \"\"\"\n",
    "    Field documentation: https://www.bindingdb.org/bind/chemsearch/marvin/BindingDB-TSV-Format.pdf\n",
    "    \"\"\"\n",
    "    read_file = gzip.open(path, 'rt')\n",
    "    reader = csv.reader(read_file, delimiter='\\t')\n",
    "    header = next(reader)\n",
    "    chains_index = header.index(chains_key)\n",
    "    target0_index = chains_index + 1\n",
    "    ligand_fields = header[:chains_index + 1]\n",
    "    for j, row in enumerate(reader):\n",
    "        if max_rows is not None and j == max_rows:\n",
    "            break\n",
    "        row = [x if x else None for x in row]\n",
    "        ligand_values = row[:chains_index + 1]\n",
    "        # Ensure line has sufficient ligand fields\n",
    "        if len(row) < chains_index + 1:\n",
    "            if verbose:\n",
    "                print('Line', j + 2, 'is deficient')\n",
    "            continue\n",
    "        rowdict = collections.OrderedDict(zip(ligand_fields, ligand_values))\n",
    "        for key in [chains_key]:\n",
    "            if key not in rowdict:\n",
    "                print(j+2)\n",
    "                print(row)\n",
    "                print(rowdict)\n",
    "            rowdict[key] = int(rowdict[key])\n",
    "        chains = list()\n",
    "        assert rowdict[chains_key] == len(row[target0_index:]) / len(target_fields)\n",
    "        for i in range(rowdict[chains_key]):\n",
    "            i_0 = target0_index + i * len(target_fields)\n",
    "            i_1 = target0_index + (i + 1) * len(target_fields)\n",
    "            target_values = row[i_0:i_1]\n",
    "            chain = collections.OrderedDict(zip(target_fields, target_values))\n",
    "            chains.append(chain)\n",
    "        rowdict['chains'] = chains\n",
    "        yield rowdict\n",
    "    read_file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Line 192304 is deficient\n",
      "Line 192305 is deficient\n",
      "Line 192306 is deficient\n",
      "Line 192307 is deficient\n",
      "Line 192308 is deficient\n",
      "Line 192309 is deficient\n",
      "Line 192310 is deficient\n",
      "Line 192311 is deficient\n",
      "Line 192312 is deficient\n",
      "Line 192313 is deficient\n",
      "Line 192314 is deficient\n",
      "Line 192315 is deficient\n",
      "Line 192316 is deficient\n",
      "Line 192317 is deficient\n",
      "Line 192318 is deficient\n",
      "Line 192319 is deficient\n",
      "Line 192320 is deficient\n",
      "Line 192321 is deficient\n",
      "Line 192322 is deficient\n",
      "Line 192323 is deficient\n",
      "Line 192324 is deficient\n",
      "Line 192325 is deficient\n",
      "Line 192326 is deficient\n",
      "Line 192327 is deficient\n",
      "Line 192328 is deficient\n",
      "Line 192329 is deficient\n",
      "Line 192330 is deficient\n",
      "Line 192331 is deficient\n",
      "Line 192332 is deficient\n",
      "Line 192333 is deficient\n",
      "Line 192334 is deficient\n",
      "Line 192335 is deficient\n",
      "Line 192336 is deficient\n",
      "Line 192337 is deficient\n",
      "Line 192338 is deficient\n",
      "Line 192339 is deficient\n",
      "Line 192340 is deficient\n",
      "Line 192341 is deficient\n",
      "Line 192342 is deficient\n",
      "Line 192343 is deficient\n",
      "Line 192344 is deficient\n",
      "Line 192345 is deficient\n",
      "Line 192346 is deficient\n",
      "Line 192347 is deficient\n",
      "Line 192348 is deficient\n",
      "Line 192349 is deficient\n",
      "Line 192350 is deficient\n",
      "Line 192351 is deficient\n",
      "Line 192352 is deficient\n",
      "Line 192353 is deficient\n",
      "Line 192354 is deficient\n",
      "Line 192355 is deficient\n",
      "Line 192356 is deficient\n",
      "Line 192357 is deficient\n",
      "Line 192358 is deficient\n",
      "Line 192359 is deficient\n",
      "Line 192360 is deficient\n",
      "Line 192361 is deficient\n",
      "Line 192362 is deficient\n",
      "Line 192363 is deficient\n",
      "Line 192364 is deficient\n",
      "Line 192365 is deficient\n",
      "Line 192366 is deficient\n",
      "Line 192367 is deficient\n",
      "Line 192368 is deficient\n",
      "Line 192369 is deficient\n",
      "Line 192370 is deficient\n",
      "Line 192371 is deficient\n",
      "Line 192372 is deficient\n",
      "Line 192373 is deficient\n",
      "Line 192374 is deficient\n",
      "Line 192375 is deficient\n",
      "Line 192376 is deficient\n",
      "Line 192377 is deficient\n",
      "Line 192378 is deficient\n",
      "Line 192379 is deficient\n",
      "Line 192380 is deficient\n",
      "Line 192381 is deficient\n",
      "Line 192382 is deficient\n",
      "Line 192383 is deficient\n",
      "Line 192384 is deficient\n",
      "Line 192385 is deficient\n",
      "Line 192386 is deficient\n",
      "Line 192387 is deficient\n",
      "Line 192388 is deficient\n",
      "Line 192389 is deficient\n",
      "Line 192390 is deficient\n",
      "Line 192391 is deficient\n",
      "Line 192392 is deficient\n",
      "Line 192393 is deficient\n",
      "Line 192394 is deficient\n",
      "Line 192395 is deficient\n",
      "Line 192396 is deficient\n",
      "Line 192397 is deficient\n",
      "Line 192398 is deficient\n",
      "Line 192399 is deficient\n",
      "Line 192400 is deficient\n",
      "Line 192401 is deficient\n",
      "Line 192402 is deficient\n",
      "Line 192403 is deficient\n",
      "Line 192404 is deficient\n",
      "Line 192405 is deficient\n",
      "Line 192406 is deficient\n",
      "Line 192407 is deficient\n",
      "Line 192408 is deficient\n",
      "Line 192409 is deficient\n",
      "Line 192410 is deficient\n",
      "Line 192411 is deficient\n",
      "Line 192412 is deficient\n",
      "Line 192413 is deficient\n",
      "Line 192414 is deficient\n",
      "Line 192415 is deficient\n",
      "Line 192416 is deficient\n",
      "Line 192417 is deficient\n",
      "Line 192418 is deficient\n",
      "Line 192419 is deficient\n",
      "Line 192420 is deficient\n",
      "Line 192421 is deficient\n",
      "Line 192422 is deficient\n",
      "Line 192423 is deficient\n",
      "Line 192424 is deficient\n",
      "Line 192425 is deficient\n",
      "Line 192426 is deficient\n",
      "Line 192427 is deficient\n",
      "Line 192428 is deficient\n",
      "Line 192429 is deficient\n",
      "Line 192430 is deficient\n",
      "Line 192431 is deficient\n",
      "Line 192432 is deficient\n",
      "Line 192433 is deficient\n",
      "Line 192434 is deficient\n",
      "Line 192435 is deficient\n",
      "Line 192436 is deficient\n",
      "Line 192437 is deficient\n",
      "Line 192438 is deficient\n",
      "Line 192439 is deficient\n",
      "Line 192440 is deficient\n",
      "Line 192441 is deficient\n",
      "Line 192442 is deficient\n",
      "Line 192443 is deficient\n",
      "Line 192444 is deficient\n",
      "Line 192445 is deficient\n",
      "Line 192446 is deficient\n",
      "Line 192447 is deficient\n",
      "Line 192448 is deficient\n",
      "Line 192449 is deficient\n",
      "Line 192450 is deficient\n",
      "Line 192451 is deficient\n",
      "Line 192452 is deficient\n",
      "Line 192453 is deficient\n",
      "Line 192454 is deficient\n",
      "Line 192455 is deficient\n",
      "Line 192456 is deficient\n",
      "Line 192457 is deficient\n",
      "Line 192458 is deficient\n",
      "Line 192459 is deficient\n",
      "Line 192460 is deficient\n",
      "Line 192461 is deficient\n",
      "Line 192462 is deficient\n",
      "Line 192463 is deficient\n",
      "Line 192464 is deficient\n",
      "Line 192465 is deficient\n",
      "Line 192466 is deficient\n",
      "Line 192467 is deficient\n",
      "Line 192468 is deficient\n",
      "Line 192469 is deficient\n",
      "Line 192470 is deficient\n",
      "Line 192471 is deficient\n",
      "Line 192472 is deficient\n",
      "Line 192473 is deficient\n"
     ]
    }
   ],
   "source": [
    "path = os.path.join('download', filename + '.gz')\n",
    "bindingdb_generator = read_bindingdb(path, verbose=True)\n",
    "\n",
    "bindings = list()\n",
    "for i, row in enumerate(bindingdb_generator):\n",
    "    #if i > 10000:\n",
    "    #    break\n",
    "    if len(row['chains']) != 1:\n",
    "        continue\n",
    "    chain, = row['chains']\n",
    "    uniprots = chain['UniProt (SwissProt) Primary ID of Target Chain']\n",
    "    if not uniprots:\n",
    "        continue\n",
    "    uniprots = uniprots.split(',')\n",
    "\n",
    "    template = dict()\n",
    "    template['bindingdb_id'] = row['BindingDB MonomerID']\n",
    "    template['reaction_id'] = row['BindingDB Reactant_set_id']\n",
    "    template['source'] = row['Curation/DataSource']\n",
    "    template['organism'] = row['Target Source Organism According to Curator or DataSource']\n",
    "    template['pubmed'] = row['PMID']\n",
    "    template['doi'] = row['Article DOI']\n",
    "\n",
    "    affinities = {'Ki': row['Ki (nM)'], 'Kd': row['Kd (nM)'], 'IC50': row['IC50 (nM)']}\n",
    "    for measure, affinity in affinities.items():\n",
    "        if affinity is None:\n",
    "            continue\n",
    "        for uniprot in uniprots:\n",
    "            entrez_set = uniprot_to_entrez.get(uniprot)\n",
    "            if not entrez_set:\n",
    "                # uniprot_id not found in mapping\n",
    "                continue\n",
    "            for entrez in entrez_set:\n",
    "                binding = template.copy()\n",
    "                binding['measure'] = measure\n",
    "                binding['affinity_nM'] = affinity\n",
    "                binding['uniprot'] = uniprot\n",
    "                binding['entrez_gene'] = entrez\n",
    "                bindings.append(binding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "< 1267\n",
      "> 129495\n",
      "= 603956\n",
      "errors 19\n"
     ]
    }
   ],
   "source": [
    "# Convert affinities to floats\n",
    "lt, gt, eq, err = 0, 0, 0, 0\n",
    "for binding in bindings:\n",
    "    affinity = binding['affinity_nM']\n",
    "    if affinity.startswith('<'):\n",
    "        affinity = affinity.lstrip('<')\n",
    "        affinity = float(affinity)\n",
    "        if affinity >= 10.0:\n",
    "            affinity -= 1.0\n",
    "        lt += 1\n",
    "    elif affinity.startswith('>'):\n",
    "        affinity = affinity.lstrip('>')\n",
    "        affinity = float(affinity)\n",
    "        affinity += 1.0\n",
    "        gt += 1\n",
    "    else:\n",
    "        try:\n",
    "            affinity = float(affinity)\n",
    "            eq += 1\n",
    "        except ValueError:\n",
    "            affinity = None\n",
    "            err += 1\n",
    "    binding['affinity_nM'] = affinity\n",
    "print('< {}\\n> {}\\n= {}\\nerrors {}'.format(lt, gt, eq, err))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fields = ['reaction_id', 'bindingdb_id', 'uniprot', 'entrez_gene',\n",
    "          'measure', 'affinity_nM', 'source', 'organism', 'pubmed', 'doi']\n",
    "with gzip.open('data/binding.tsv.gz', 'wt') as write_file:\n",
    "    writer = csv.DictWriter(write_file, delimiter='\\t', fieldnames=fields)\n",
    "    writer.writeheader()\n",
    "    bindings.sort(key=operator.itemgetter(*fields))\n",
    "    writer.writerows(bindings)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Calculate summary and diagnostic information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Counter({frozenset({'IC50 (nM)'}): 462031,\n",
      "         frozenset({'Ki (nM)'}): 227288,\n",
      "         frozenset({'EC50 (nM)'}): 65727,\n",
      "         frozenset({'Kd (nM)'}): 52197,\n",
      "         frozenset({'Ki (nM)', 'IC50 (nM)'}): 1184,\n",
      "         frozenset(): 892,\n",
      "         frozenset({'IC50 (nM)', 'EC50 (nM)'}): 628,\n",
      "         frozenset({'Ki (nM)', 'EC50 (nM)'}): 574,\n",
      "         frozenset({'IC50 (nM)', 'Kd (nM)'}): 93,\n",
      "         frozenset({'Ki (nM)', 'IC50 (nM)', 'EC50 (nM)'}): 23,\n",
      "         frozenset({'EC50 (nM)', 'Kd (nM)'}): 8,\n",
      "         frozenset({'Ki (nM)', 'Kd (nM)'}): 4,\n",
      "         frozenset({'Ki (nM)', 'IC50 (nM)', 'Kd (nM)'}): 1})\n"
     ]
    }
   ],
   "source": [
    "# Measurement types\n",
    "path = os.path.join('download', filename + '.gz')\n",
    "bindingdb_generator = read_bindingdb(path)\n",
    "\n",
    "measure_keys = ['Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)'] #, 'kon (M-1-s-1)', 'koff (s-1)']\n",
    "\n",
    "measures = list()\n",
    "for i, row in enumerate(bindingdb_generator):\n",
    "    if len(row['chains']) != 1:\n",
    "        continue\n",
    "    chain, = row['chains']\n",
    "    uniprot = chain['UniProt (SwissProt) Primary ID of Target Chain']\n",
    "    if not uniprot:\n",
    "        continue\n",
    "    measure_set = frozenset(key for key in measure_keys if row[key] is not None)\n",
    "    measures.append(measure_set)\n",
    "\n",
    "pprint.pprint(collections.Counter(measures))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({1: 1101095, 2: 31218, 3: 5393, 4: 526, 5: 302, 6: 343, 12: 3, 19: 1})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of chains (proteins in target)\n",
    "path = os.path.join('download', filename + '.gz')\n",
    "bindingdb_generator = read_bindingdb(path)\n",
    "collections.Counter(int(row[chains_key]) for row in bindingdb_generator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({False: 290445, True: 810650})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Targets that mapped to SwissProt\n",
    "path = os.path.join('download', filename + '.gz')\n",
    "bindingdb_generator = read_bindingdb(path)\n",
    "\n",
    "collections.Counter(\n",
    "    bool(row['chains'][0]['UniProt (SwissProt) Primary ID of Target Chain'])\n",
    "    for row in bindingdb_generator if len(row['chains']) == 1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({'Rattus norvegicus': 78798,\n",
       "         'Vibrio harveyi': 60,\n",
       "         'Malus domestica': 4,\n",
       "         'Lymnaea stagnalis': 154,\n",
       "         'Macaca fascicularis': 271,\n",
       "         'Naja mossambica': 9,\n",
       "         'Thermus thermophilus': 13,\n",
       "         'Pseudomonas aeruginosa': 8,\n",
       "         'Varicella-zoster virus (strain Dumas)': 66,\n",
       "         'Human immunodeficiency virus type 1 group M subtype B (isolate HXB2)': 301,\n",
       "         'Pseudomonas fluorescens': 3,\n",
       "         'Streptococcus pyogenes': 24,\n",
       "         'Vibrio proteolyticus': 40,\n",
       "         'Musca domestica': 13,\n",
       "         'Mesocricetus auratus': 74,\n",
       "         'Bacillus amyloliquefaciens': 4,\n",
       "         'Hepatitis C virus genotype 1b (isolate Con1)': 34,\n",
       "         'Hepatitis C virus genotype 3a (isolate NZL1)': 38,\n",
       "         'Vibrio fischeri': 83,\n",
       "         'Saccharomyces cerevisiae': 17,\n",
       "         'Pichia angusta': 14,\n",
       "         'Clostridium perfringens': 124,\n",
       "         'Poliovirus type 1 (strain Mahoney)': 20,\n",
       "         'Klebsiella pneumoniae': 35,\n",
       "         'Mus musculus': 25316,\n",
       "         'Photinus pyralis': 96,\n",
       "         'Spiroplasma sp. (strain MQ-1)': 36,\n",
       "         'Humicola insolens': 7,\n",
       "         'Influenza A virus (strain A/Memphis/1/1971 H3N2)': 8,\n",
       "         'Influenza B virus (strain B/Lee/1940)': 224,\n",
       "         'Drosophila melanogaster': 130,\n",
       "         'Rhizobium radiobacter': 41,\n",
       "         'Toxoplasma gondii': 1205,\n",
       "         'Paramecium tetraurelia': 6,\n",
       "         'Dictyostelium discoideum': 7,\n",
       "         'Gallus gallus': 875,\n",
       "         'Influenza A virus (strain A/Tokyo/3/1967 H2N2)': 14,\n",
       "         'Neisseria gonorrhoeae': 103,\n",
       "         'Bacillus lentus': 36,\n",
       "         'Carica papaya': 201,\n",
       "         'Mycoplana ramosa': 11,\n",
       "         'Bacillus subtilis': 23,\n",
       "         'Streptococcus pneumoniae (strain ATCC BAA-255 / R6)': 11,\n",
       "         'Legionella pneumophila': 13,\n",
       "         'Stenotrophomonas maltophilia': 58,\n",
       "         'Yersinia pestis': 34,\n",
       "         'Thermoanaerobacter saccharolyticum': 2,\n",
       "         'Leishmania mexicana': 46,\n",
       "         'Human immunodeficiency virus type 1 group M subtype B (isolate BRU/LAI)': 1,\n",
       "         'Ovis aries': 1618,\n",
       "         'Staphylococcus aureus': 656,\n",
       "         'Methanosarcina thermophila': 79,\n",
       "         'Sus scrofa': 4259,\n",
       "         'Rhizopus oryzae': 2,\n",
       "         'Bos taurus': 13461,\n",
       "         'Human immunodeficiency virus type 1 group M subtype B (isolate PCV12)': 36,\n",
       "         'Plasmodium falciparum': 819,\n",
       "         'Leuconostoc mesenteroides': 168,\n",
       "         'Clostridium botulinum': 427,\n",
       "         'Vibrio harveyi (strain ATCC BAA-1116 / BB120)': 3,\n",
       "         'Lucilia cuprina': 24,\n",
       "         'Photuris pennsylvanica': 76,\n",
       "         'Citrobacter freundii': 9,\n",
       "         'Avian erythroblastosis virus (strain ES4)': 15,\n",
       "         'Human immunodeficiency virus type 1 group M subtype B (isolate YU-2)': 166,\n",
       "         'Streptococcus pyogenes serotype M1': 38,\n",
       "         'Bacillus anthracis': 407,\n",
       "         'Trypanosoma brucei brucei': 131,\n",
       "         'Thermus aquaticus': 95,\n",
       "         'Solanum tuberosum': 1,\n",
       "         'Vaccinia virus (strain Western Reserve)': 28,\n",
       "         'Human herpesvirus 1 (strain SC16)': 168,\n",
       "         'Human herpesvirus 6A (strain Uganda-1102)': 69,\n",
       "         'Aspergillus aculeatus': 5,\n",
       "         'Fujinami sarcoma virus': 1,\n",
       "         'Electrophorus electricus': 2784,\n",
       "         'Human herpesvirus 1 (strain 17)': 680,\n",
       "         'Serratia marcescens': 36,\n",
       "         'Rhizobium meliloti': 2,\n",
       "         'Aspergillus niger': 20,\n",
       "         'Pneumocystis carinii': 889,\n",
       "         'Bacillus licheniformis': 26,\n",
       "         'Plasmodium falciparum (isolate K1 / Thailand)': 966,\n",
       "         'West Nile virus': 158,\n",
       "         'Luciola lateralis': 22,\n",
       "         'Staphylococcus aureus (strain MRSA252)': 74,\n",
       "         'Penicillium janthinellum': 9,\n",
       "         'Apis mellifera': 33,\n",
       "         None: 1738,\n",
       "         'Streptomyces caespitosus': 4,\n",
       "         'Torpedo marmorata': 5,\n",
       "         'Danio rerio': 35,\n",
       "         'Staphylococcus aureus (strain Mu50 / ATCC 700699)': 46,\n",
       "         'Meleagris gallopavo': 98,\n",
       "         'Candida albicans': 491,\n",
       "         'Cavia porcellus': 7656,\n",
       "         'Canis familiaris': 312,\n",
       "         'Echis carinatus': 8,\n",
       "         'Providencia stuartii': 4,\n",
       "         'Carassius auratus': 39,\n",
       "         'Naja naja': 9,\n",
       "         'Human cytomegalovirus (strain AD169)': 375,\n",
       "         'Aedes aegypti': 34,\n",
       "         'Yersinia enterocolitica': 21,\n",
       "         'Actinomadura sp. (strain R39)': 32,\n",
       "         'Escherichia coli': 353,\n",
       "         'Plasmodium falciparum (isolate 3D7)': 100,\n",
       "         'Brassica oleracea var. capitata': 12,\n",
       "         'Streptococcus pneumoniae': 31,\n",
       "         'Haemophilus influenzae (strain ATCC 51907 / DSM 11121 / KW20 / Rd)': 1,\n",
       "         'Mycobacterium smegmatis': 4,\n",
       "         'Epstein-Barr virus (strain B95-8)': 30,\n",
       "         'Feline herpesvirus 1': 4,\n",
       "         'Abelson murine leukemia virus': 80,\n",
       "         'Bovine viral diarrhea virus (strain CP7)': 1,\n",
       "         'Human papillomavirus type 16': 42,\n",
       "         'Nicotiana tabacum': 13,\n",
       "         'Caenorhabditis elegans': 2245,\n",
       "         'Influenza A virus (strain A/Aichi/2/1968 H3N2)': 5,\n",
       "         'Influenza A virus (strain A/Puerto Rico/8/1934 H1N1)': 208,\n",
       "         'Xenopus laevis': 24,\n",
       "         'Glycine max': 171,\n",
       "         'Canis lupus dingo': 70,\n",
       "         'Hansenula anomala': 5,\n",
       "         'Canavalia ensiformis': 194,\n",
       "         'Agaricus bisporus': 813,\n",
       "         'Enterococcus faecium': 14,\n",
       "         'Naja melanoleuca': 8,\n",
       "         'Plasmodium falciparum (isolate FcB1 / Columbia)': 726,\n",
       "         'Lactobacillus fermentum': 4,\n",
       "         'Influenza A virus (strain A/Brevig Mission/1/1918 H1N1)': 27,\n",
       "         'Human papillomavirus type 11': 54,\n",
       "         'Bacillus thermoproteolyticus': 134,\n",
       "         'Enterobacter cloacae': 307,\n",
       "         'Pisum sativum': 9,\n",
       "         'Mycobacterium tuberculosis': 12,\n",
       "         'Human SARS coronavirus': 100,\n",
       "         'Oryctolagus cuniculus': 4261,\n",
       "         'Clostridium botulinum (strain Hall / ATCC 3502 / NCTC 13319 / Type A)': 179,\n",
       "         'Flavobacterium meningosepticum': 3,\n",
       "         'Crithidia fasciculata': 50,\n",
       "         'Equus caballus': 1720,\n",
       "         'Macaca mulatta': 100,\n",
       "         'Trypanosoma cruzi': 1222,\n",
       "         'Saccharomyces cerevisiae (strain ATCC 204508 / S288c)': 1383,\n",
       "         'Vibrio fischeri (strain ATCC 700601 / ES114)': 32,\n",
       "         'Human herpesvirus 1 (strain KOS)': 4,\n",
       "         'Human herpesvirus 1': 51,\n",
       "         'Ricinus communis': 71,\n",
       "         'Bison bison': 15,\n",
       "         'Lactobacillus casei': 700,\n",
       "         'Alcaligenes sp. (strain DSM 11172)': 44,\n",
       "         'Aeromonas hydrophila': 14,\n",
       "         'Human T-cell leukemia virus 1 (strain Japan ATK-1 subtype A)': 31,\n",
       "         'Enterobacteria phage T4': 77,\n",
       "         'Rhizopus chinensis': 11,\n",
       "         'Nipah virus': 7,\n",
       "         'Hordeum vulgare': 6,\n",
       "         'Caldocellum saccharolyticum': 41,\n",
       "         'Human papillomavirus type 1a': 5,\n",
       "         'Cryptosporidium parvum': 355,\n",
       "         'Enterobacteria phage lambda': 11,\n",
       "         'Leishmania major': 118,\n",
       "         'Zea mays': 46,\n",
       "         'Hepatitis C virus genotype 4a (isolate ED43)': 1,\n",
       "         'Woolly monkey sarcoma virus': 22,\n",
       "         'Human rhinovirus 16': 5,\n",
       "         'Human immunodeficiency virus type 1 group M subtype B (isolate MN)': 10,\n",
       "         'Human herpesvirus 2 (strain HG52)': 45,\n",
       "         'Influenza B virus (strain B/Memphis/3/1989)': 2,\n",
       "         'Arabidopsis thaliana': 173,\n",
       "         'Bombyx mori': 25,\n",
       "         'Oryza sativa subsp. japonica': 16,\n",
       "         'Homo sapiens': 645210,\n",
       "         'Bacillus cereus': 51,\n",
       "         'Staphylococcus aureus (strain MW2)': 42,\n",
       "         'Hepatitis C virus genotype 1b (isolate BK)': 1,\n",
       "         'Torpedo californica': 574,\n",
       "         'Pseudomonas putida': 37})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Species\n",
    "path = os.path.join('download', filename + '.gz')\n",
    "bindingdb_generator = read_bindingdb(path)\n",
    "\n",
    "collections.Counter(\n",
    "    row['Target Source Organism According to Curator or DataSource']\n",
    "    for row in bindingdb_generator if\n",
    "    len(row['chains']) == 1 and \n",
    "    row['chains'][0]['UniProt (SwissProt) Primary ID of Target Chain']\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}