{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Requirements for running this notebook:\n", "- RDKit\n", "- SQLAlchemy\n", "- NumPy\n", "- Pandas\n", "- PyTables\n", "- A connection to a ChEMBL database!\n", "\n", "## This notebook extracts data from ChEMBL database and formats it for a multi-task neural network training problem." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem\n", "from rdkit.Chem import AllChem\n", "from rdkit.Chem import DataStructs\n", "from rdkit.Chem import rdMolDescriptors\n", "from sqlalchemy import create_engine\n", "from sqlalchemy.sql import text\n", "import numpy as np\n", "import pandas as pd\n", "import tables as tb\n", "from tables.atom import ObjectAtom\n", "import json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Get data from ChEMBL\n", "- You can use our SQLite dump: http://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_25/chembl_25_sqlite.tar.gz" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "engine = create_engine('sqlite:///chembl_25/chembl_25_sqlite/chembl_25.db')\n", "# ex postgresql: 'postgresql://user:pass@host:5432/chembl_25'\n", "\n", "qtext = \"\"\"\n", "SELECT\n", " activities.doc_id AS doc_id,\n", " activities.standard_value AS standard_value,\n", " molecule_hierarchy.parent_molregno AS molregno,\n", " compound_structures.canonical_smiles AS canonical_smiles,\n", " molecule_dictionary.chembl_id AS chembl_id,\n", " target_dictionary.tid AS tid,\n", " target_dictionary.chembl_id AS target_chembl_id,\n", " protein_family_classification.l1 AS l1,\n", " protein_family_classification.l2 AS l2,\n", " protein_family_classification.l3 AS l3\n", "FROM activities\n", " JOIN assays ON activities.assay_id = assays.assay_id\n", " JOIN target_dictionary ON assays.tid = target_dictionary.tid\n", " JOIN target_components ON target_dictionary.tid = target_components.tid\n", " JOIN component_class ON target_components.component_id = component_class.component_id\n", " JOIN protein_family_classification ON component_class.protein_class_id = protein_family_classification.protein_class_id\n", " JOIN molecule_dictionary ON activities.molregno = molecule_dictionary.molregno\n", " JOIN molecule_hierarchy ON molecule_dictionary.molregno = molecule_hierarchy.molregno\n", " JOIN compound_structures ON molecule_hierarchy.parent_molregno = compound_structures.molregno\n", "WHERE activities.standard_units = 'nM' AND\n", " activities.standard_type IN ('EC50', 'IC50', 'Ki', 'Kd', 'XC50', 'AC50', 'Potency') AND\n", " activities.data_validity_comment IS NULL AND\n", " activities.standard_relation IN ('=', '<') AND\n", " activities.potential_duplicate = 0 AND assays.confidence_score >= 8 AND\n", " target_dictionary.target_type = 'SINGLE PROTEIN'\"\"\"\n", "\n", "with engine.begin() as conn:\n", " res = conn.execute(text(qtext))\n", " df = pd.DataFrame(res.fetchall())\n", "\n", "df.columns = res.keys()\n", "df = df.where((pd.notnull(df)), None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Drop duplicate activities keeping the activity with lower concentration for each molecule-target pair" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = df.sort_values(by=['standard_value', 'molregno', 'tid'], ascending=True)\n", "df = df.drop_duplicates(subset=['molregno', 'tid'], keep='first')\n", "\n", "# save to csv\n", 
"df.to_csv('chembl_activity_data.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Set to active/inactive by threshold\n", "- Depending on family type from IDG: https://druggablegenome.net/ProteinFam\n", "\n", " - Kinases: <= 30nM\n", " - GPCRs: <= 100nM\n", " - Nuclear Receptors: <= 100nM\n", " - Ion Channels: <= 10μM\n", " - Non-IDG Family Targets: <= 1μM\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def set_active(row):\n", " active = 0\n", " if row['standard_value'] <= 1000:\n", " active = 1\n", " if row['l1'] == 'Ion channel':\n", " if row['standard_value'] <= 10000:\n", " active = 1\n", " if row['l2'] == 'Kinase':\n", " if row['standard_value'] > 30:\n", " active = 0\n", " if row['l2'] == 'Nuclear receptor':\n", " if row['standard_value'] > 100:\n", " active = 0\n", " if row['l3'] and 'GPCR' in row['l3']:\n", " if row['standard_value'] > 100:\n", " active = 0\n", " return active\n", "\n", "df['active'] = df.apply(lambda row: set_active(row), axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Filter target data\n", "\n", "- Keep targets mentioned at least in two different docs\n", "- Keep targets with at least 100 active and 100 inactive molecules. Threshold set to 100 to get a 'small' dataset that will train faster on this example." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# get targets with at least 100 different active molecules\n", "acts = df[df['active'] == 1].groupby(['target_chembl_id']).agg('count')\n", "acts = acts[acts['molregno'] >= 100].reset_index()['target_chembl_id']\n", "\n", "# get targets with at least 100 different inactive molecules\n", "inacts = df[df['active'] == 0].groupby(['target_chembl_id']).agg('count')\n", "inacts = inacts[inacts['molregno'] >= 100].reset_index()['target_chembl_id']\n", "\n", "# get targets mentioned in at least two docs\n", "docs = df.drop_duplicates(subset=['doc_id', 'target_chembl_id'])\n", "docs = docs.groupby(['target_chembl_id']).agg('count')\n", "docs = docs[docs['doc_id'] >= 2.0].reset_index()['target_chembl_id']\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of unique targets: 560\n", " Ion channel: 5\n", " Kinase: 96\n", " Nuclear receptor: 21\n", " GPCR: 180\n", " Others: 258\n" ] } ], "source": [ "t_keep = set(acts).intersection(set(inacts)).intersection(set(docs))\n", "\n", "# get dta for filtered targets\n", "activities = df[df['target_chembl_id'].isin(t_keep)]\n", "\n", "ion = pd.unique(activities[activities['l1'] == 'Ion channel']['tid']).shape[0]\n", "kin = pd.unique(activities[activities['l2'] == 'Kinase']['tid']).shape[0]\n", "nuc = pd.unique(activities[activities['l2'] == 'Nuclear receptor']['tid']).shape[0]\n", "gpcr = pd.unique(activities[activities['l3'].str.contains('GPCR', na=False)]['tid']).shape[0]\n", "\n", "print('Number of unique targets: ', len(t_keep))\n", "print(' Ion channel: ', ion)\n", "print(' Kinase: ', kin)\n", "print(' Nuclear receptor: ', nuc)\n", "print(' GPCR: ', gpcr)\n", "print(' Others: ', len(t_keep) - ion - kin - nuc - gpcr)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# save it to a file\n", "activities.to_csv('chembl_activity_data_filtered.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Prepare the label matrix for the multi-task deep neural network\n", 
"\n", " - known active = 1\n", " - known no-active = 0\n", " - unknown activity = -1, so we'll be able to easilly filter them and won't be taken into account when calculating the loss during model training.\n", " \n", "The matrix is extremely sparse so using sparse matrices (COO/CSR/CSC) should be considered. There are a couple of issues making it a bit tricker than what it should be so we'll keep the example without them.\n", "\n", "- https://github.com/pytorch/pytorch/issues/20248\n", "- https://github.com/scipy/scipy/issues/7531\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def gen_dict(group):\n", " return {tid: act for tid, act in zip(group['target_chembl_id'], group['active'])}\n", "\n", "group = activities.groupby('chembl_id')\n", "temp = pd.DataFrame(group.apply(gen_dict))\n", "mt_df = pd.DataFrame(temp[0].tolist())\n", "mt_df['chembl_id'] = temp.index\n", "mt_df = mt_df.where((pd.notnull(mt_df)), -1)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "structs = activities[['chembl_id', 'canonical_smiles']].drop_duplicates(subset='chembl_id')\n", "\n", "# drop mols not sanitizing on rdkit\n", "structs['romol'] = structs.apply(lambda row: Chem.MolFromSmiles(row['canonical_smiles']), axis=1)\n", "structs = structs.dropna()\n", "del structs['romol']\n", "\n", "# add the structures to the final df\n", "mt_df = pd.merge(structs, mt_df, how='inner', on='chembl_id')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# save to csv\n", "mt_df.to_csv('chembl_multi_task_data.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Calc fingeprints and save data to a PyTables H5 file" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "FP_SIZE = 1024\n", "RADIUS = 2\n", "\n", "def calc_fp(smiles, fp_size, radius):\n", " \"\"\"\n", " calcs morgan fingerprints as a numpy array.\n", " \"\"\"\n", " mol = Chem.MolFromSmiles(smiles, sanitize=False)\n", " mol.UpdatePropertyCache(False)\n", " Chem.GetSSSR(mol)\n", " fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=fp_size)\n", " a = np.zeros((0,), dtype=np.float32)\n", " Chem.DataStructs.ConvertToNumpyArray(fp, a)\n", " return a\n", "\n", "# calc fps\n", "descs = [calc_fp(smi, FP_SIZE, RADIUS) for smi in mt_df['canonical_smiles'].values]\n", "descs = np.asarray(descs, dtype=np.float32)\n", "\n", "# put all training data in a pytables file\n", "with tb.open_file('mt_data.h5', mode='w') as t_file:\n", "\n", " # set compression filter. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Open the H5 file and show the shape of all collections" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(711591,)\n", "(560,)\n", "(711591, 1024)\n", "(711591, 560)\n", "(560,)\n" ] } ], "source": [ "with tb.open_file('mt_data.h5', mode='r') as t_file:\n", "    print(t_file.root.chembl_ids.shape)\n", "    print(t_file.root.target_chembl_ids.shape)\n", "    print(t_file.root.fps.shape)\n", "    print(t_file.root.labels.shape)\n", "    print(t_file.root.weights.shape)\n", "\n", "    # save target chembl ids to a json file\n", "    with open('targets.json', 'w') as f:\n", "        json.dump(t_file.root.target_chembl_ids[:], f)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }