{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction\n",
    "\n",
    "This notebook gives a short introduction to using Dask to parallelize model training, particularly when you have multiple learning tasks and want to train an individual model for each one.\n",
    "\n",
    "For brevity, I will not elaborate on the exact machine learning task here; instead, I will focus on the idioms we need in order to use Dask for it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "\n",
    "from dask.distributed import LocalCluster, Client\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import janitor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Instantiate a Dask Cluster\n",
    "\n",
    "Here, we instantiate a Dask `cluster` (this is only a `LocalCluster`, but other cluster types can be created too, such as an `SGECluster` or `KubeCluster`). We then connect a `client` to the cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ericmjl/anaconda/envs/minimal-panel/lib/python3.7/site-packages/distributed/dashboard/core.py:72: UserWarning: \n",
      "Port 8787 is already in use. \n",
      "Perhaps you already have a cluster running?\n",
      "Hosting the diagnostics dashboard on a random port instead.\n",
      "  warnings.warn(\"\\n\" + msg)\n"
     ]
    }
   ],
   "source": [
    "# Start the local cluster explicitly, then attach a client to it, so the code\n",
    "# mirrors the two steps described above and `cluster` stays available by name.\n",
    "cluster = LocalCluster()\n",
    "client = Client(cluster)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preprocessing\n",
    "\n",
    "We will now preprocess our data and get it into a shape for machine learning."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import molecular_weights, featurize_sequence_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "drugs = [\"ATV\", \"DRV\", \"FPV\", \"IDV\", \"LPV\", \"NFV\", \"SQV\", \"TPV\"]\n",
    "\n",
    "# Keep rows with weight == 1.0 and a sequence length of 99, featurize each\n",
    "# sequence, and log10-transform the drug columns.\n",
    "data = (\n",
    "    pd.read_csv(\"data/hiv-protease-data-expanded.csv\", index_col=0)\n",
    "    .query(\"weight == 1.0\")\n",
    "    .transform_column(\"sequence\", len, \"seq_length\")\n",
    "    .query(\"seq_length == 99\")\n",
    "    .transform_column(\"sequence\", featurize_sequence_, \"features\")\n",
    "    .transform_columns(drugs, np.log10)\n",
    ")\n",
    "\n",
    "# Stack the per-row feature arrays into a single table that shares data's index.\n",
    "features = pd.DataFrame(np.vstack(data[\"features\"]), index=data.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ATV</th>\n",
       "      <th>DRV</th>\n",
       "      <th>FPV</th>\n",
       "      <th>IDV</th>\n",
       "      <th>LPV</th>\n",
       "      <th>NFV</th>\n",
       "      <th>SQV</th>\n",
       "      <th>SeqID</th>\n",
       "      <th>TPV</th>\n",
       "      <th>seqid</th>\n",
       "      <th>sequence</th>\n",
       "      <th>sequence_object</th>\n",
       "      <th>weight</th>\n",
       "      <th>seq_length</th>\n",
       "      <th>features</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1.50515</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.477121</td>\n",
       "      <td>1.544068</td>\n",
       "      <td>1.50515</td>\n",
       "      <td>1.462398</td>\n",
       "      <td>2.214844</td>\n",
       "      <td>4426</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4426-0</td>\n",
       "      <td>PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKM...</td>\n",
       "      <td>ID: 4426-0\\nName: &lt;unknown name&gt;\\nDescription:...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>99</td>\n",
       "      <td>[[115.131, 146.1451, 131.1736, 119.1197, 131.1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.176091</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.342423</td>\n",
       "      <td>0.041393</td>\n",
       "      <td>4432</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4432-0</td>\n",
       "      <td>PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...</td>\n",
       "      <td>ID: 4432-0\\nName: &lt;unknown name&gt;\\nDescription:...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>99</td>\n",
       "      <td>[[115.131, 146.1451, 131.1736, 119.1197, 131.1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.491362</td>\n",
       "      <td>0.939519</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.505150</td>\n",
       "      <td>1.227887</td>\n",
       "      <td>4664</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4664-0</td>\n",
       "      <td>PQITLWQRPIVTIKVGGQLIEALLDTGADDTVLEEINLPGRWKPKM...</td>\n",
       "      <td>ID: 4664-0\\nName: &lt;unknown name&gt;\\nDescription:...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>99</td>\n",
       "      <td>[[115.131, 146.1451, 131.1736, 119.1197, 131.1...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        ATV  DRV       FPV       IDV      LPV       NFV       SQV  SeqID  TPV  \\\n",
       "6   1.50515  NaN  0.477121  1.544068  1.50515  1.462398  2.214844   4426  NaN   \n",
       "7       NaN  NaN  0.176091  0.000000      NaN  0.342423  0.041393   4432  NaN   \n",
       "14      NaN  NaN  0.491362  0.939519      NaN  1.505150  1.227887   4664  NaN   \n",
       "\n",
       "     seqid                                           sequence  \\\n",
       "6   4426-0  PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKM...   \n",
       "7   4432-0  PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...   \n",
       "14  4664-0  PQITLWQRPIVTIKVGGQLIEALLDTGADDTVLEEINLPGRWKPKM...   \n",
       "\n",
       "                                      sequence_object  weight  seq_length  \\\n",
       "6   ID: 4426-0\\nName: <unknown name>\\nDescription:...     1.0          99   \n",
       "7   ID: 4432-0\\nName: <unknown name>\\nDescription:...     1.0          99   \n",
       "14  ID: 4664-0\\nName: <unknown name>\\nDescription:...     1.0          99   \n",
       "\n",
       "                                             features  \n",
       "6   [[115.131, 146.1451, 131.1736, 119.1197, 131.1...  \n",
       "7   [[115.131, 146.1451, 131.1736, 119.1197, 131.1...  \n",
       "14  [[115.131, 146.1451, 131.1736, 119.1197, 131.1...  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first three rows of the preprocessed table.\n",
    "data.head(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>89</th>\n",
       "      <th>90</th>\n",
       "      <th>91</th>\n",
       "      <th>92</th>\n",
       "      <th>93</th>\n",
       "      <th>94</th>\n",
       "      <th>95</th>\n",
       "      <th>96</th>\n",
       "      <th>97</th>\n",
       "      <th>98</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>115.131</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>204.2262</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>174.2017</td>\n",
       "      <td>115.131</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>...</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>75.0669</td>\n",
       "      <td>121.159</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>132.1184</td>\n",
       "      <td>165.19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>115.131</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>204.2262</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>174.2017</td>\n",
       "      <td>115.131</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>...</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>75.0669</td>\n",
       "      <td>121.159</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>132.1184</td>\n",
       "      <td>165.19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>115.131</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>204.2262</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>174.2017</td>\n",
       "      <td>115.131</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>...</td>\n",
       "      <td>149.2124</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>146.1451</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>75.0669</td>\n",
       "      <td>121.159</td>\n",
       "      <td>119.1197</td>\n",
       "      <td>131.1736</td>\n",
       "      <td>132.1184</td>\n",
       "      <td>165.19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 99 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         0         1         2         3         4         5         6   \\\n",
       "6   115.131  146.1451  131.1736  119.1197  131.1736  204.2262  146.1451   \n",
       "7   115.131  146.1451  131.1736  119.1197  131.1736  204.2262  146.1451   \n",
       "14  115.131  146.1451  131.1736  119.1197  131.1736  204.2262  146.1451   \n",
       "\n",
       "          7        8         9   ...        89        90        91        92  \\\n",
       "6   174.2017  115.131  131.1736  ...  131.1736  119.1197  146.1451  131.1736   \n",
       "7   174.2017  115.131  131.1736  ...  131.1736  119.1197  146.1451  131.1736   \n",
       "14  174.2017  115.131  131.1736  ...  149.2124  119.1197  146.1451  131.1736   \n",
       "\n",
       "         93       94        95        96        97      98  \n",
       "6   75.0669  121.159  119.1197  131.1736  132.1184  165.19  \n",
       "7   75.0669  121.159  119.1197  131.1736  132.1184  165.19  \n",
       "14  75.0669  121.159  119.1197  131.1736  132.1184  165.19  \n",
       "\n",
       "[3 rows x 99 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first three rows of the feature table.\n",
    "features.head(n=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define training functions\n",
    "\n",
    "When writing code to interface with Dask, a functional paradigm is often preferred. Hence, we will write the procedures that are needed inside functions that can be submitted by the `client` to the `cluster`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import featurize_sequence_, fit_model, cross_validate, predict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we'll scatter the data around the workers. `dataf` is so named because it holds the \"data future\": a \"promise\" to the workers that `data` will exist for them and that they can access it. Likewise for `featuresf`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scatter both tables in one call; a list input yields one future per item,\n",
    "# in the same order.\n",
    "dataf, featuresf = client.scatter([data, features])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we fit the models, and collect their cross-validated scores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Submit one fit job and one cross-validation job per drug; each submit call\n",
    "# returns a future immediately.\n",
    "models = {\n",
    "    drug: client.submit(fit_model, dataf, featuresf, drug) for drug in drugs\n",
    "}\n",
    "scores = {\n",
    "    drug: client.submit(cross_validate, dataf, featuresf, drug) for drug in drugs\n",
    "}\n",
    "\n",
    "# Replace the model futures with their results; `scores` stays a dict of futures.\n",
    "models = client.gather(models)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, let's save the models. To save space on disk, we will pickle and gzip them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import pickle as pkl\n",
    "from pathlib import Path\n",
    "\n",
    "# Create the output directory up front so a fresh checkout does not fail\n",
    "# with FileNotFoundError on the first gzip.open call.\n",
    "model_dir = Path(\"data/models\")\n",
    "model_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# One gzip-compressed pickle per model, named after its key in `models`.\n",
    "for name, model in models.items():\n",
    "    with gzip.open(model_dir / f\"{name}.pkl.gz\", \"wb\") as f:\n",
    "        pkl.dump(model, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the cross-validation results from the cluster, then pickle and gzip them.\n",
    "scores = client.gather(scores)\n",
    "with gzip.open(\"data/scores.pkl.gz\", mode=\"wb\") as scores_file:\n",
    "    pkl.dump(scores, scores_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "minimal-panel",
   "language": "python",
   "name": "minimal-panel"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}