{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pairwise structural similarity of DrugBank compounds\n",
    "\n",
    "Downloads the DrugBank structure file (SDF), computes a Morgan fingerprint (radius 2) for every molecule with RDKit, and scores each pair of compounds by Dice similarity."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import itertools\n",
    "import gzip\n",
    "\n",
    "import pandas\n",
    "import rdkit.Chem\n",
    "import rdkit.Chem.AllChem\n",
    "import rdkit.DataStructs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2015-05-18 14:03:23-- http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip\n",
      "Resolving www.drugbank.ca (www.drugbank.ca)... 162.243.242.197\n",
      "Connecting to www.drugbank.ca (www.drugbank.ca)|162.243.242.197|:80... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 4362948 (4.2M) [application/zip]\n",
      "Server file no newer than local file ‘download/all.sdf.zip’ -- not retrieving.\n",
      "\n",
      "Archive: download/all.sdf.zip\n",
      " inflating: download/all.sdf \n"
     ]
    }
   ],
   "source": [
    "# Download DrugBank SDF file of structures.\n",
    "# --timestamping only re-downloads when the server copy is newer than the local\n",
    "# archive, so the zip is deliberately kept in download/ between runs.\n",
    "! wget --timestamping --directory-prefix download http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip\n",
    "# -o overwrites a previously extracted all.sdf without prompting, so the cell\n",
    "# can be re-run non-interactively.\n",
    "! unzip -o -d download download/all.sdf.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6743"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read SDF File, dropping entries that the supplier yields as None\n",
    "supplier = rdkit.Chem.SDMolSupplier('download/all.sdf')\n",
    "molecules = [mol for mol in supplier if mol is not None]\n",
    "len(molecules)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Calculate fingerprints: one Morgan fingerprint (radius 2) per molecule,\n",
    "# keyed by the record's DATABASE_ID property\n",
    "fingerprints = {}\n",
    "for mol in molecules:\n",
    "    drugbank_id = mol.GetProp('DATABASE_ID')\n",
    "    fingerprints[drugbank_id] = rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Calculate pairwise compound similarities: Dice similarity for every\n",
    "# unordered pair of fingerprints, rounded to four decimal places\n",
    "similarity_rows = []\n",
    "for (id0, fp0), (id1, fp1) in itertools.combinations(fingerprints.items(), 2):\n",
    "    similarity = round(rdkit.DataStructs.DiceSimilarity(fp0, fp1), 4)\n",
    "    similarity_rows.append([id0, id1, similarity])"
   ]
  },
  { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | compound0 | \n", "compound1 | \n", "similarity | \n", "
---|---|---|---|
0 | \n", "DB05107 | \n", "DB08426 | \n", "0.0966 | \n", "
1 | \n", "DB05107 | \n", "DB05105 | \n", "0.0441 | \n", "
2 | \n", "DB05107 | \n", "DB05104 | \n", "0.1457 | \n", "
3 | \n", "DB05107 | \n", "DB08423 | \n", "0.1268 | \n", "
4 | \n", "DB05107 | \n", "DB05102 | \n", "0.1000 | \n", "
5 rows × 3 columns
\n", "