{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Requirements for running this notebook:\n", "- RDKit\n", "- SQLAlchemy\n", "- NumPy\n", "- Pandas\n", "- PyTables\n", "- A connection to a ChEMBL database!\n", "\n", "## This notebook extracts data from ChEMBL database and formats it for a multi-task neural network training problem." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem\n", "from rdkit.Chem import AllChem\n", "from rdkit.Chem import DataStructs\n", "from rdkit.Chem import rdMolDescriptors\n", "from sqlalchemy import create_engine\n", "from sqlalchemy.sql import text\n", "import numpy as np\n", "import pandas as pd\n", "import tables as tb\n", "from tables.atom import ObjectAtom\n", "import json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Get data from ChEMBL\n", "- You can use our SQLite dump: http://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_25/chembl_25_sqlite.tar.gz" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "engine = create_engine('sqlite:///chembl_25/chembl_25_sqlite/chembl_25.db')\n", "# ex postgresql: 'postgresql://user:pass@host:5432/chembl_25'\n", "\n", "qtext = \"\"\"\n", "SELECT\n", " activities.doc_id AS doc_id,\n", " activities.standard_value AS standard_value,\n", " molecule_hierarchy.parent_molregno AS molregno,\n", " compound_structures.canonical_smiles AS canonical_smiles,\n", " molecule_dictionary.chembl_id AS chembl_id,\n", " target_dictionary.tid AS tid,\n", " target_dictionary.chembl_id AS target_chembl_id,\n", " protein_family_classification.l1 AS l1,\n", " protein_family_classification.l2 AS l2,\n", " protein_family_classification.l3 AS l3\n", "FROM activities\n", " JOIN assays ON activities.assay_id = assays.assay_id\n", " JOIN target_dictionary ON assays.tid = target_dictionary.tid\n", " JOIN target_components ON target_dictionary.tid = target_components.tid\n", " JOIN component_class ON target_components.component_id = component_class.component_id\n", " JOIN protein_family_classification ON component_class.protein_class_id = protein_family_classification.protein_class_id\n", " JOIN molecule_dictionary ON activities.molregno = molecule_dictionary.molregno\n", " JOIN molecule_hierarchy ON molecule_dictionary.molregno = molecule_hierarchy.molregno\n", " JOIN compound_structures ON molecule_hierarchy.parent_molregno = compound_structures.molregno\n", "WHERE activities.standard_units = 'nM' AND\n", " activities.standard_type IN ('EC50', 'IC50', 'Ki', 'Kd', 'XC50', 'AC50', 'Potency') AND\n", " activities.data_validity_comment IS NULL AND\n", " activities.standard_relation IN ('=', '<') AND\n", " activities.potential_duplicate = 0 AND assays.confidence_score >= 8 AND\n", " target_dictionary.target_type = 'SINGLE PROTEIN'\"\"\"\n", "\n", "with engine.begin() as conn:\n", " res = conn.execute(text(qtext))\n", " df = pd.DataFrame(res.fetchall())\n", "\n", "df.columns = res.keys()\n", "df = df.where((pd.notnull(df)), None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Drop duplicate activities keeping the activity with lower concentration for each molecule-target pair" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = df.sort_values(by=['standard_value', 'molregno', 'tid'], ascending=True)\n", "df = df.drop_duplicates(subset=['molregno', 'tid'], keep='first')\n", "\n", "# save to csv\n", 
"df.to_csv('chembl_activity_data.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Set to active/inactive by threshold\n", "- Depending on family type from IDG: https://druggablegenome.net/ProteinFam\n", "\n", " - Kinases: <= 30nM\n", " - GPCRs: <= 100nM\n", " - Nuclear Receptors: <= 100nM\n", " - Ion Channels: <= 10μM\n", " - Non-IDG Family Targets: <= 1μM\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def set_active(row):\n", " active = 0\n", " if row['standard_value'] <= 1000:\n", " active = 1\n", " if row['l1'] == 'Ion channel':\n", " if row['standard_value'] <= 10000:\n", " active = 1\n", " if row['l2'] == 'Kinase':\n", " if row['standard_value'] > 30:\n", " active = 0\n", " if row['l2'] == 'Nuclear receptor':\n", " if row['standard_value'] > 100:\n", " active = 0\n", " if row['l3'] and 'GPCR' in row['l3']:\n", " if row['standard_value'] > 100:\n", " active = 0\n", " return active\n", "\n", "df['active'] = df.apply(lambda row: set_active(row), axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Filter target data\n", "\n", "- Keep targets mentioned at least in two different docs\n", "- Keep targets with at least 100 active and 100 inactive molecules. Threshold set to 100 to get a 'small' dataset that will train faster on this example." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# get targets with at least 100 different active molecules\n", "acts = df[df['active'] == 1].groupby(['target_chembl_id']).agg('count')\n", "acts = acts[acts['molregno'] >= 100].reset_index()['target_chembl_id']\n", "\n", "# get targets with at least 100 different inactive molecules\n", "inacts = df[df['active'] == 0].groupby(['target_chembl_id']).agg('count')\n", "inacts = inacts[inacts['molregno'] >= 100].reset_index()['target_chembl_id']\n", "\n", "# get targets mentioned in at least two docs\n", "docs = df.drop_duplicates(subset=['doc_id', 'target_chembl_id'])\n", "docs = docs.groupby(['target_chembl_id']).agg('count')\n", "docs = docs[docs['doc_id'] >= 2.0].reset_index()['target_chembl_id']\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of unique targets: 560\n", " Ion channel: 5\n", " Kinase: 96\n", " Nuclear receptor: 21\n", " GPCR: 180\n", " Others: 258\n" ] } ], "source": [ "t_keep = set(acts).intersection(set(inacts)).intersection(set(docs))\n", "\n", "# get dta for filtered targets\n", "activities = df[df['target_chembl_id'].isin(t_keep)]\n", "\n", "ion = pd.unique(activities[activities['l1'] == 'Ion channel']['tid']).shape[0]\n", "kin = pd.unique(activities[activities['l2'] == 'Kinase']['tid']).shape[0]\n", "nuc = pd.unique(activities[activities['l2'] == 'Nuclear receptor']['tid']).shape[0]\n", "gpcr = pd.unique(activities[activities['l3'].str.contains('GPCR', na=False)]['tid']).shape[0]\n", "\n", "print('Number of unique targets: ', len(t_keep))\n", "print(' Ion channel: ', ion)\n", "print(' Kinase: ', kin)\n", "print(' Nuclear receptor: ', nuc)\n", "print(' GPCR: ', gpcr)\n", "print(' Others: ', len(t_keep) - ion - kin - nuc - gpcr)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# save it to a file\n", "activities.to_csv('chembl_activity_data_filtered.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Prepare the label matrix for the multi-task deep neural network\n", 
"\n", " - known active = 1\n", " - known no-active = 0\n", " - unknown activity = -1, so we'll be able to easilly filter them and won't be taken into account when calculating the loss during model training.\n", " \n", "The matrix is extremely sparse so using sparse matrices (COO/CSR/CSC) should be considered. There are a couple of issues making it a bit tricker than what it should be so we'll keep the example without them.\n", "\n", "- https://github.com/pytorch/pytorch/issues/20248\n", "- https://github.com/scipy/scipy/issues/7531\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def gen_dict(group):\n", " return {tid: act for tid, act in zip(group['target_chembl_id'], group['active'])}\n", "\n", "group = activities.groupby('chembl_id')\n", "temp = pd.DataFrame(group.apply(gen_dict))\n", "mt_df = pd.DataFrame(temp[0].tolist())\n", "mt_df['chembl_id'] = temp.index\n", "mt_df = mt_df.where((pd.notnull(mt_df)), -1)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "structs = activities[['chembl_id', 'canonical_smiles']].drop_duplicates(subset='chembl_id')\n", "\n", "# drop mols not sanitizing on rdkit\n", "structs['romol'] = structs.apply(lambda row: Chem.MolFromSmiles(row['canonical_smiles']), axis=1)\n", "structs = structs.dropna()\n", "del structs['romol']\n", "\n", "# add the structures to the final df\n", "mt_df = pd.merge(structs, mt_df, how='inner', on='chembl_id')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# save to csv\n", "mt_df.to_csv('chembl_multi_task_data.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Calc fingeprints and save data to a PyTables H5 file" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "FP_SIZE = 1024\n", "RADIUS = 2\n", "\n", "def calc_fp(smiles, fp_size, radius):\n", " \"\"\"\n", " calcs morgan fingerprints as a numpy array.\n", " \"\"\"\n", " mol = Chem.MolFromSmiles(smiles, sanitize=False)\n", " mol.UpdatePropertyCache(False)\n", " Chem.GetSSSR(mol)\n", " fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=fp_size)\n", " a = np.zeros((0,), dtype=np.float32)\n", " Chem.DataStructs.ConvertToNumpyArray(fp, a)\n", " return a\n", "\n", "# calc fps\n", "descs = [calc_fp(smi, FP_SIZE, RADIUS) for smi in mt_df['canonical_smiles'].values]\n", "descs = np.asarray(descs, dtype=np.float32)\n", "\n", "# put all training data in a pytables file\n", "with tb.open_file('mt_data.h5', mode='w') as t_file:\n", "\n", " # set compression filter. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Open the H5 file and show the shape of all collections" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(711591,)\n", "(560,)\n", "(711591, 1024)\n", "(711591, 560)\n", "(560,)\n" ] } ], "source": [ "with tb.open_file('mt_data.h5', mode='r') as t_file:\n", "    print(t_file.root.chembl_ids.shape)\n", "    print(t_file.root.target_chembl_ids.shape)\n", "    print(t_file.root.fps.shape)\n", "    print(t_file.root.labels.shape)\n", "    print(t_file.root.weights.shape)\n", "\n", "    # save target chembl ids to a json file\n", "    with open('targets.json', 'w') as f:\n", "        json.dump(t_file.root.target_chembl_ids[:], f)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }