{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Analysis of chemical similarity in `python` using `rdkit`\n",
    "\n",
    "This notebook shows simple examples of finding and comparing the chemical [fingerprints](http://rdkit.org/UGM/2012/Landrum_RDKit_UGM.Fingerprints.Final.pptx.pdf) of molecular structures using [`rdkit`](http://www.rdkit.org/docs/GettingStartedInPython.html).\n",
    "\n",
    "\n",
    "\n",
    "### Required libraries and modules\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# numpy and matplotlib\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import matplotlib.cm as cm\n",
    "\n",
    "# scipy\n",
    "from scipy.spatial.distance import pdist, squareform\n",
    "import scipy.cluster.hierarchy as hc\n",
    "\n",
    "# seaborn -- for better looking plots\n",
    "import seaborn as sns\n",
    "\n",
    "# pandas \n",
    "import pandas as pd\n",
    "\n",
    "# rdkit\n",
    "from rdkit import Chem\n",
    "from rdkit.Chem import Draw\n",
    "from rdkit.Chem import rdmolops\n",
    "from rdkit.Chem import Descriptors\n",
    "from rdkit import DataStructs\n",
    "from rdkit.Chem import rdMolDescriptors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clustering molecules by fingerprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'c1ccsc1': 'thiophene', 'CC': 'ethane', 'c1ccccc1': 'benzene', 'CC(O)=O': 'acetic acid', 'C=CC(=O)N': 'acrylamide', 'CCO': 'ethanol', 'Nc1ccccc1': 'aniline', '[Na+].[O-]C(=O)C': 'sodium acetate', '[Na+].[Cl-]': 'sodium chloride', 'CCC': 'propane'}\n"
     ]
    }
   ],
   "source": [
    "# SMILES strings of the example molecules; names_list below is index-aligned with it.\n",
    "smiles_list = ['CC',\n",
    "               'CCC',\n",
    "               'CCO',\n",
    "               'c1ccccc1',\n",
    "               '[Na+].[Cl-]',\n",
    "               '[Na+].[O-]C(=O)C',\n",
    "               'c1ccsc1',\n",
    "               'CC(O)=O',\n",
    "               'C=CC(=O)N',\n",
    "               'Nc1ccccc1']\n",
    "\n",
    "names_list = ['ethane',\n",
    "              'propane',\n",
    "              'ethanol',\n",
    "              'benzene',\n",
    "              'sodium chloride',\n",
    "              'sodium acetate',\n",
    "              'thiophene',\n",
    "              'acetic acid',\n",
    "              'acrylamide',\n",
    "              'aniline']\n",
    "\n",
    "# zip() silently drops unmatched trailing entries, so fail loudly if the\n",
    "# two hand-maintained lists ever drift apart.\n",
    "assert len(smiles_list) == len(names_list), 'smiles_list and names_list differ in length'\n",
    "\n",
    "# Map each SMILES string to its common name.\n",
    "names_dict = dict(zip(smiles_list, names_list))\n",
    "# Function-call form of print: valid on Python 3 and prints the same text on Python 2.\n",
    "print(names_dict)\n",
    "\n",
    "mols = []\n",
    "fingerprints = []  # not populated in this cell\n",
    "\n",
    "for smiles in smiles_list:\n",
    "    mol = Chem.MolFromSmiles(smiles)\n",
    "    # RDKit reports an unparseable SMILES by returning None; stop here with a\n",
    "    # clear message instead of failing later inside RDKFingerprint.\n",
    "    if mol is None:\n",
    "        raise ValueError('could not parse SMILES: %r' % smiles)\n",
    "    mols.append(mol)\n",
    "\n",
    "# Stack one boolean fingerprint row per molecule.\n",
    "# np.vstack is given a list: passing a generator is deprecated since NumPy 1.16\n",
    "# and rejected by newer releases.\n",
    "fingerprint_mat = np.vstack([np.asarray(rdmolops.RDKFingerprint(mol, fpSize=2048), dtype='bool')\n",
    "                             for mol in mols])"
   ]
  },
  { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | CC | \n", "CCC | \n", "CCO | \n", "c1ccccc1 | \n", "[Na+].[Cl-] | \n", "[Na+].[O-]C(=O)C | \n", "c1ccsc1 | \n", "CC(O)=O | \n", "C=CC(=O)N | \n", "Nc1ccccc1 | \n", "
---|---|---|---|---|---|---|---|---|---|---|
CC | \n", "0.000000 | \n", "0.500 | \n", "0.666667 | \n", "1.000000 | \n", "1 | \n", "0.857143 | \n", "1.000000 | \n", "0.857143 | \n", "0.913043 | \n", "1.000000 | \n", "
CCC | \n", "0.500000 | \n", "0.000 | \n", "0.750000 | \n", "1.000000 | \n", "1 | \n", "0.875000 | \n", "1.000000 | \n", "0.875000 | \n", "0.920000 | \n", "1.000000 | \n", "
CCO | \n", "0.666667 | \n", "0.750 | \n", "0.000000 | \n", "1.000000 | \n", "1 | \n", "0.571429 | \n", "1.000000 | \n", "0.571429 | \n", "0.925926 | \n", "1.000000 | \n", "
c1ccccc1 | \n", "1.000000 | \n", "1.000 | \n", "1.000000 | \n", "0.000000 | \n", "1 | \n", "0.960000 | \n", "0.800000 | \n", "0.960000 | \n", "0.970588 | \n", "0.684211 | \n", "
[Na+].[Cl-] | \n", "1.000000 | \n", "1.000 | \n", "1.000000 | \n", "1.000000 | \n", "0 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "
[Na+].[O-]C(=O)C | \n", "0.857143 | \n", "0.875 | \n", "0.571429 | \n", "0.960000 | \n", "1 | \n", "0.000000 | \n", "1.000000 | \n", "0.000000 | \n", "0.806452 | \n", "0.980392 | \n", "
c1ccsc1 | \n", "1.000000 | \n", "1.000 | \n", "1.000000 | \n", "0.800000 | \n", "1 | \n", "1.000000 | \n", "0.000000 | \n", "1.000000 | \n", "0.978261 | \n", "0.892857 | \n", "
CC(O)=O | \n", "0.857143 | \n", "0.875 | \n", "0.571429 | \n", "0.960000 | \n", "1 | \n", "0.000000 | \n", "1.000000 | \n", "0.000000 | \n", "0.806452 | \n", "0.980392 | \n", "
C=CC(=O)N | \n", "0.913043 | \n", "0.920 | \n", "0.925926 | \n", "0.970588 | \n", "1 | \n", "0.806452 | \n", "0.978261 | \n", "0.806452 | \n", "0.000000 | \n", "0.983333 | \n", "
Nc1ccccc1 | \n", "1.000000 | \n", "1.000 | \n", "1.000000 | \n", "0.684211 | \n", "1 | \n", "0.980392 | \n", "0.892857 | \n", "0.980392 | \n", "0.983333 | \n", "0.000000 | \n", "