{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Lale: Type-Driven Auto-ML with Scikit-Learn\n",
    "\n",
    "### https://github.com/ibm/lale\n",
    "\n",
    "### Example Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: liac-arff>=2.4.0 in /home/hirzel/python3.6venv/lib/python3.6/site-packages (2.4.0)\r\n"
     ]
    }
   ],
   "source": [
    "!pip install 'liac-arff>=2.4.0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y</th>\n",
       "      <th>checking_status</th>\n",
       "      <th>duration</th>\n",
       "      <th>credit_history</th>\n",
       "      <th>purpose</th>\n",
       "      <th>credit_amount</th>\n",
       "      <th>savings_status</th>\n",
       "      <th>employment</th>\n",
       "      <th>installment_commitment</th>\n",
       "      <th>personal_status</th>\n",
       "      <th>...</th>\n",
       "      <th>residence_since</th>\n",
       "      <th>property_magnitude</th>\n",
       "      <th>age</th>\n",
       "      <th>other_payment_plans</th>\n",
       "      <th>housing</th>\n",
       "      <th>existing_credits</th>\n",
       "      <th>job</th>\n",
       "      <th>num_dependents</th>\n",
       "      <th>own_telephone</th>\n",
       "      <th>foreign_worker</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>835</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>no credits/all paid</td>\n",
       "      <td>new car</td>\n",
       "      <td>1082.0</td>\n",
       "      <td>&lt;100</td>\n",
       "      <td>1&lt;=X&lt;4</td>\n",
       "      <td>4.0</td>\n",
       "      <td>male single</td>\n",
       "      <td>...</td>\n",
       "      <td>4.0</td>\n",
       "      <td>car</td>\n",
       "      <td>48.0</td>\n",
       "      <td>bank</td>\n",
       "      <td>own</td>\n",
       "      <td>2.0</td>\n",
       "      <td>skilled</td>\n",
       "      <td>1.0</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>192</th>\n",
       "      <td>0</td>\n",
       "      <td>0&lt;=X&lt;200</td>\n",
       "      <td>27.0</td>\n",
       "      <td>existing paid</td>\n",
       "      <td>business</td>\n",
       "      <td>3915.0</td>\n",
       "      <td>&lt;100</td>\n",
       "      <td>1&lt;=X&lt;4</td>\n",
       "      <td>4.0</td>\n",
       "      <td>male single</td>\n",
       "      <td>...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>car</td>\n",
       "      <td>36.0</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1.0</td>\n",
       "      <td>skilled</td>\n",
       "      <td>2.0</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>629</th>\n",
       "      <td>1</td>\n",
       "      <td>no checking</td>\n",
       "      <td>9.0</td>\n",
       "      <td>existing paid</td>\n",
       "      <td>education</td>\n",
       "      <td>3832.0</td>\n",
       "      <td>no known savings</td>\n",
       "      <td>&gt;=7</td>\n",
       "      <td>1.0</td>\n",
       "      <td>male single</td>\n",
       "      <td>...</td>\n",
       "      <td>4.0</td>\n",
       "      <td>real estate</td>\n",
       "      <td>64.0</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>1.0</td>\n",
       "      <td>unskilled resident</td>\n",
       "      <td>1.0</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>559</th>\n",
       "      <td>0</td>\n",
       "      <td>0&lt;=X&lt;200</td>\n",
       "      <td>18.0</td>\n",
       "      <td>critical/other existing credit</td>\n",
       "      <td>furniture/equipment</td>\n",
       "      <td>1928.0</td>\n",
       "      <td>&lt;100</td>\n",
       "      <td>&lt;1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>male single</td>\n",
       "      <td>...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>real estate</td>\n",
       "      <td>31.0</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>2.0</td>\n",
       "      <td>unskilled resident</td>\n",
       "      <td>1.0</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>1</td>\n",
       "      <td>0&lt;=X&lt;200</td>\n",
       "      <td>36.0</td>\n",
       "      <td>delayed previously</td>\n",
       "      <td>business</td>\n",
       "      <td>9857.0</td>\n",
       "      <td>100&lt;=X&lt;500</td>\n",
       "      <td>4&lt;=X&lt;7</td>\n",
       "      <td>1.0</td>\n",
       "      <td>male single</td>\n",
       "      <td>...</td>\n",
       "      <td>3.0</td>\n",
       "      <td>life insurance</td>\n",
       "      <td>31.0</td>\n",
       "      <td>none</td>\n",
       "      <td>own</td>\n",
       "      <td>2.0</td>\n",
       "      <td>unskilled resident</td>\n",
       "      <td>2.0</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     y checking_status  duration                  credit_history  \\\n",
       "835  0              <0      12.0             no credits/all paid   \n",
       "192  0        0<=X<200      27.0                   existing paid   \n",
       "629  1     no checking       9.0                   existing paid   \n",
       "559  0        0<=X<200      18.0  critical/other existing credit   \n",
       "684  1        0<=X<200      36.0              delayed previously   \n",
       "\n",
       "                 purpose  credit_amount    savings_status employment  \\\n",
       "835              new car         1082.0              <100     1<=X<4   \n",
       "192             business         3915.0              <100     1<=X<4   \n",
       "629            education         3832.0  no known savings        >=7   \n",
       "559  furniture/equipment         1928.0              <100         <1   \n",
       "684             business         9857.0        100<=X<500     4<=X<7   \n",
       "\n",
       "     installment_commitment personal_status  ... residence_since  \\\n",
       "835                     4.0     male single  ...             4.0   \n",
       "192                     4.0     male single  ...             2.0   \n",
       "629                     1.0     male single  ...             4.0   \n",
       "559                     2.0     male single  ...             2.0   \n",
       "684                     1.0     male single  ...             3.0   \n",
       "\n",
       "     property_magnitude   age  other_payment_plans housing existing_credits  \\\n",
       "835                 car  48.0                 bank     own              2.0   \n",
       "192                 car  36.0                 none     own              1.0   \n",
       "629         real estate  64.0                 none     own              1.0   \n",
       "559         real estate  31.0                 none     own              2.0   \n",
       "684      life insurance  31.0                 none     own              2.0   \n",
       "\n",
       "                    job num_dependents  own_telephone foreign_worker  \n",
       "835             skilled            1.0           none            yes  \n",
       "192             skilled            2.0            yes            yes  \n",
       "629  unskilled resident            1.0           none            yes  \n",
       "559  unskilled resident            1.0           none            yes  \n",
       "684  unskilled resident            2.0            yes            yes  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import lale.datasets.openml\n",
    "import pandas as pd\n",
    "(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n",
    "    'credit-g', 'classification', preprocess=False)\n",
    "pd.concat([pd.DataFrame({'y': train_y}, index=train_X.index).tail(),\n",
    "           train_X.tail()], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Algorithm Selection and Hyperparameter Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import Normalizer as Norm\n",
    "from sklearn.preprocessing import OneHotEncoder as OneHot\n",
    "from sklearn.linear_model import LogisticRegression as LR\n",
    "from xgboost import XGBClassifier as XGBoost\n",
    "from sklearn.svm import LinearSVC\n",
    "from lale.operators import make_pipeline, make_union\n",
    "from lale.lib.lale import Project, ConcatFeatures, NoOp\n",
    "lale.wrap_imported_operators()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"366pt\" height=\"244pt\"\n",
       " viewBox=\"0.00 0.00 365.87 244.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 240)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-240 361.865,-240 361.865,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<g id=\"clust1\" class=\"cluster\"><title>cluster:choice_0</title>\n",
       "<g id=\"a_clust1\"><a xlink:title=\"choice_0 = norm | no_op\">\n",
       "<polygon fill=\"#7ec0ee\" stroke=\"black\" points=\"82,-106 82,-228 152,-228 152,-106 82,-106\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-212.8\" font-family=\"Times,serif\" font-size=\"14.00\">Choice</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<g id=\"clust2\" class=\"cluster\"><title>cluster:choice_1</title>\n",
       "<g id=\"a_clust2\"><a xlink:title=\"choice_1 = lr | linear_svc | xg_boost\">\n",
       "<polygon fill=\"#7ec0ee\" stroke=\"black\" points=\"274.468,-8 274.468,-177 349.865,-177 349.865,-8 274.468,-8\"/>\n",
       "<text text-anchor=\"middle\" x=\"312.167\" y=\"-161.8\" font-family=\"Times,serif\" font-size=\"14.00\">Choice</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"27\" cy=\"-179\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-176.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm -->\n",
       "<g id=\"node2\" class=\"node\"><title>norm</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html\" xlink:title=\"norm = Norm\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"117\" cy=\"-179\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-176.2\" font-family=\"Times,serif\" font-size=\"11.00\">Norm</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;norm -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;norm</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-179C62.3932,-179 71.3106,-179 79.8241,-179\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"72.0002,-182.5 82,-179 71.9998,-175.5 72.0002,-182.5\"/>\n",
       "</g>\n",
       "<!-- concat_features -->\n",
       "<g id=\"node6\" class=\"node\"><title>concat_features</title>\n",
       "<g id=\"a_node6\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"concat_features = ConcatFeatures\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"213.234\" cy=\"-128\" rx=\"33.4697\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"213.234\" y=\"-131.2\" font-family=\"Times,serif\" font-size=\"11.00\">Concat&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"213.234\" y=\"-119.2\" font-family=\"Times,serif\" font-size=\"11.00\">Features</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm&#45;&gt;concat_features -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>norm&#45;&gt;concat_features</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M151.682,-160.771C160.603,-155.943 170.224,-150.736 179.114,-145.925\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"181.031,-148.867 188.159,-141.029 177.699,-142.711 181.031,-148.867\"/>\n",
       "</g>\n",
       "<!-- no_op -->\n",
       "<g id=\"node3\" class=\"node\"><title>no_op</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html\" xlink:title=\"no_op = NoOp\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-134\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-137.2\" font-family=\"Times,serif\" font-size=\"11.00\">No&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-125.2\" font-family=\"Times,serif\" font-size=\"11.00\">Op</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node4\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;string&#39;})\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"27\" cy=\"-78\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-75.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- one_hot -->\n",
       "<g id=\"node5\" class=\"node\"><title>one_hot</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html\" xlink:title=\"one_hot = OneHot\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"117\" cy=\"-78\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-81.2\" font-family=\"Times,serif\" font-size=\"11.00\">One&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-69.2\" font-family=\"Times,serif\" font-size=\"11.00\">Hot</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;one_hot -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>project_1&#45;&gt;one_hot</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-78C62.3932,-78 71.3106,-78 79.8241,-78\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-81.5001 89.919,-78 79.919,-74.5001 79.919,-81.5001\"/>\n",
       "</g>\n",
       "<!-- one_hot&#45;&gt;concat_features -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>one_hot&#45;&gt;concat_features</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M139.383,-89.3458C151.006,-95.513 165.644,-103.279 178.744,-110.23\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"177.471,-113.517 187.945,-115.113 180.752,-107.334 177.471,-113.517\"/>\n",
       "</g>\n",
       "<!-- lr -->\n",
       "<g id=\"node7\" class=\"node\"><title>lr</title>\n",
       "<g id=\"a_node7\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html\" xlink:title=\"lr = LR\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"312.167\" cy=\"-128\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"312.167\" y=\"-125.2\" font-family=\"Times,serif\" font-size=\"11.00\">LR</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- concat_features&#45;&gt;lr -->\n",
       "<g id=\"edge5\" class=\"edge\"><title>concat_features&#45;&gt;lr</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M246.511,-128C252.356,-128 258.555,-128 264.711,-128\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"264.961,-131.5 274.96,-128 264.96,-124.5 264.961,-131.5\"/>\n",
       "</g>\n",
       "<!-- linear_svc -->\n",
       "<g id=\"node8\" class=\"node\"><title>linear_svc</title>\n",
       "<g id=\"a_node8\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html\" xlink:title=\"linear_svc = LinearSVC(dual=False)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"312.167\" cy=\"-83\" rx=\"29.8983\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"312.167\" y=\"-86.2\" font-family=\"Times,serif\" font-size=\"11.00\">Linear&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"312.167\" y=\"-74.2\" font-family=\"Times,serif\" font-size=\"11.00\">SVC</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- xg_boost -->\n",
       "<g id=\"node9\" class=\"node\"><title>xg_boost</title>\n",
       "<g id=\"a_node9\"><a xlink:href=\"https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn\" xlink:title=\"xg_boost = XGBoost\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"312.167\" cy=\"-36\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"312.167\" y=\"-39.2\" font-family=\"Times,serif\" font-size=\"11.00\">XG&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"312.167\" y=\"-27.2\" font-family=\"Times,serif\" font-size=\"11.00\">Boost</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fc027c02080>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "project_nums = Project(columns={'type': 'number'})\n",
    "project_cats = Project(columns={'type': 'string'})\n",
    "planned_pipeline = (\n",
    "       (project_nums >> (Norm | NoOp) & project_cats >> OneHot)\n",
    "    >> ConcatFeatures\n",
    "    >> (LR | LinearSVC(dual=False)| XGBoost))\n",
    "planned_pipeline.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100%|████████████| 5/5 [00:47<00:00,  9.36s/it, best loss: -0.7507273649370062]\n",
      "accuracy 72.1%\n"
     ]
    }
   ],
   "source": [
    "import sklearn.metrics\n",
    "from lale.lib.lale import Hyperopt\n",
    "auto_optimizer = Hyperopt(estimator=planned_pipeline, cv=3, max_evals=5)\n",
    "auto_trained = auto_optimizer.fit(train_X, train_y)\n",
    "auto_y = auto_trained.predict(test_X)\n",
    "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, auto_y):.1%}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Displaying Automation Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"350pt\" height=\"91pt\"\n",
       " viewBox=\"0.00 0.00 349.87 90.80\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 86.799)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-86.799 345.865,-86.799 345.865,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"27\" cy=\"-64.799\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-61.999\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm -->\n",
       "<g id=\"node2\" class=\"node\"><title>norm</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html\" xlink:title=\"norm = Norm(norm=&#39;l1&#39;)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-64.799\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-61.999\" font-family=\"Times,serif\" font-size=\"11.00\">Norm</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;norm -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;norm</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-64.799C62.3932,-64.799 71.3106,-64.799 79.8241,-64.799\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-68.2991 89.919,-64.799 79.919,-61.2991 79.919,-68.2991\"/>\n",
       "</g>\n",
       "<!-- concat_features -->\n",
       "<g id=\"node5\" class=\"node\"><title>concat_features</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"concat_features = ConcatFeatures()\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"213.234\" cy=\"-41.799\" rx=\"33.4697\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"213.234\" y=\"-44.999\" font-family=\"Times,serif\" font-size=\"11.00\">Concat&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"213.234\" y=\"-32.999\" font-family=\"Times,serif\" font-size=\"11.00\">Features</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- norm&#45;&gt;concat_features -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>norm&#45;&gt;concat_features</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.759,-58.756C151.838,-56.54 162.36,-53.972 172.425,-51.5153\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"173.383,-54.8843 182.268,-49.1129 171.723,-48.0839 173.383,-54.8843\"/>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node3\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;string&#39;})\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"27\" cy=\"-19.799\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-16.999\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- one_hot -->\n",
       "<g id=\"node4\" class=\"node\"><title>one_hot</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html\" xlink:title=\"one_hot = OneHot()\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-19.799\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-22.999\" font-family=\"Times,serif\" font-size=\"11.00\">One&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-10.999\" font-family=\"Times,serif\" font-size=\"11.00\">Hot</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;one_hot -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>project_1&#45;&gt;one_hot</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-19.799C62.3932,-19.799 71.3106,-19.799 79.8241,-19.799\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-23.2991 89.919,-19.799 79.919,-16.2991 79.919,-23.2991\"/>\n",
       "</g>\n",
       "<!-- one_hot&#45;&gt;concat_features -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>one_hot&#45;&gt;concat_features</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M143.253,-25.6946C152.074,-27.7539 162.208,-30.1199 171.945,-32.3931\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"171.343,-35.8465 181.877,-34.7117 172.934,-29.0298 171.343,-35.8465\"/>\n",
       "</g>\n",
       "<!-- linear_svc -->\n",
       "<g id=\"node6\" class=\"node\"><title>linear_svc</title>\n",
       "<g id=\"a_node6\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html\" xlink:title=\"linear_svc = LinearSVC(dual=False, C=9773.459065896624, tol=0.0006905227182226334)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"312.167\" cy=\"-41.799\" rx=\"29.8983\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"312.167\" y=\"-44.999\" font-family=\"Times,serif\" font-size=\"11.00\">Linear&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"312.167\" y=\"-32.999\" font-family=\"Times,serif\" font-size=\"11.00\">SVC</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- concat_features&#45;&gt;linear_svc -->\n",
       "<g id=\"edge5\" class=\"edge\"><title>concat_features&#45;&gt;linear_svc</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M246.511,-41.799C254.733,-41.799 263.658,-41.799 272.176,-41.799\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"272.291,-45.2991 282.291,-41.799 272.291,-38.2991 272.291,-45.2991\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fc0295f7a20>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "best_pipeline = auto_trained.get_pipeline()\n",
    "best_pipeline.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "```python\n",
       "project_0 = Project(columns={'type': 'number'})\n",
       "norm = Norm(norm='l1')\n",
       "project_1 = Project(columns={'type': 'string'})\n",
       "linear_svc = LinearSVC(dual=False, C=9773.459065896624, tol=0.0006905227182226334)\n",
       "pipeline = ((project_0 >> norm) & (project_1 >> OneHot())) >> ConcatFeatures() >> linear_svc\n",
       "```"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from lale.pretty_print import ipython_display\n",
    "ipython_display(best_pipeline, show_imports=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JSON Schemas\n",
    "\n",
    "https://json-schema.org/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "```python\n",
       "{\n",
       "    'description': 'Number of trees to fit.',\n",
       "    'type': 'integer',\n",
       "    'default': 100,\n",
       "    'minimumForOptimizer': 10,\n",
       "    'maximumForOptimizer': 1500}\n",
       "```"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ipython_display(XGBoost.hyperparam_schema('n_estimators'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "```python\n",
       "{\n",
       "    'description': 'Specify which booster to use.',\n",
       "    'enum': ['gbtree', 'gblinear', 'dart'],\n",
       "    'default': 'gbtree'}\n",
       "```"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ipython_display(XGBoost.hyperparam_schema('booster'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Invalid configuration for XGBoost(n_estimators=0.5, booster='gbtree') due to invalid value n_estimators=0.5.\n",
      "Schema of argument n_estimators: {\n",
      "    'description': 'Number of trees to fit.',\n",
      "    'type': 'integer',\n",
      "    'default': 100,\n",
      "    'minimumForOptimizer': 10,\n",
      "    'maximumForOptimizer': 1500}\n",
      "Value: 0.5\n"
     ]
    }
   ],
   "source": [
    "import jsonschema\n",
    "import sys\n",
    "try:\n",
    "    XGBoost(n_estimators=0.5, booster='gbtree')\n",
    "except jsonschema.ValidationError as e:\n",
    "    print(e.message, file=sys.stderr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Customizing Schemas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lale.schemas as schemas\n",
    "Grove = XGBoost.customize_schema(\n",
    "    n_estimators=schemas.Int(min=2, max=10),\n",
    "    booster=schemas.Enum(['gbtree']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "grove_planned = ( Project(columns={'type': 'number'}) >> Norm\n",
    "                & Project(columns={'type': 'string'}) >> OneHot\n",
    "                ) >> ConcatFeatures >> Grove"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [01:16<00:00,  7.70s/it, best loss: -0.7418455050181508]\n",
      "accuracy 75.5%\n"
     ]
    }
   ],
   "source": [
    "grove_optimizer = Hyperopt(estimator=grove_planned, cv=3, max_evals=10)\n",
    "grove_trained = grove_optimizer.fit(train_X, train_y)\n",
    "grove_y = grove_trained.predict(test_X)\n",
    "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, grove_y):.1%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "```python\n",
       "project_0 = Project(columns={'type': 'number'})\n",
       "norm = Norm(norm='max')\n",
       "project_1 = Project(columns={'type': 'string'})\n",
       "xg_boost = XGBoost(booster='gbtree', colsample_bylevel=0.9207359753561951, colsample_bytree=0.8853673179789476, learning_rate=0.7582425275075225, min_child_weight=11, n_estimators=8, reg_alpha=0.5980470775121279, reg_lambda=0.2546844052569046, subsample=0.8142720284737895)\n",
       "pipeline = ((project_0 >> norm) & (project_1 >> OneHot())) >> ConcatFeatures() >> xg_boost\n",
       "```"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "grove_best = grove_trained.get_pipeline()\n",
    "ipython_display(grove_best, show_imports=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Human-in-the-Loop Auto-ML\n",
    "\n",
    "### https://github.com/ibm/lale"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"img/2019-1208-loops.png\" style=\"width:700px\" align=\"left\">"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}