{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "shapes: train_X (4838, 5), test_X (53766, 5)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>drugName</th>\n",
       "      <th>condition</th>\n",
       "      <th>review</th>\n",
       "      <th>date</th>\n",
       "      <th>usefulCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3969</th>\n",
       "      <td>Citalopram</td>\n",
       "      <td>Depression</td>\n",
       "      <td>\"Celexa is a literally a magic pill, for me. 2...</td>\n",
       "      <td>December 8, 2014</td>\n",
       "      <td>48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4926</th>\n",
       "      <td>Prevacid</td>\n",
       "      <td>GERD</td>\n",
       "      <td>\"I have tried almost every acid reducer made i...</td>\n",
       "      <td>January 16, 2016</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>80152</th>\n",
       "      <td>Aubra</td>\n",
       "      <td>Birth Control</td>\n",
       "      <td>\"I hate this pill, extreme depression, breast ...</td>\n",
       "      <td>December 4, 2016</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79507</th>\n",
       "      <td>Microgestin Fe 1.5 / 30</td>\n",
       "      <td>Abnormal Uterine Bleeding</td>\n",
       "      <td>\"I had been on my period for 3months. The micr...</td>\n",
       "      <td>June 27, 2016</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130035</th>\n",
       "      <td>Azurette</td>\n",
       "      <td>Birth Control</td>\n",
       "      <td>\"Effective at preventing pregnancy. My skin wa...</td>\n",
       "      <td>October 14, 2013</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       drugName                  condition  \\\n",
       "3969                 Citalopram                 Depression   \n",
       "4926                   Prevacid                       GERD   \n",
       "80152                     Aubra              Birth Control   \n",
       "79507   Microgestin Fe 1.5 / 30  Abnormal Uterine Bleeding   \n",
       "130035                 Azurette              Birth Control   \n",
       "\n",
       "                                                   review              date  \\\n",
       "3969    \"Celexa is a literally a magic pill, for me. 2...  December 8, 2014   \n",
       "4926    \"I have tried almost every acid reducer made i...  January 16, 2016   \n",
       "80152   \"I hate this pill, extreme depression, breast ...  December 4, 2016   \n",
       "79507   \"I had been on my period for 3months. The micr...     June 27, 2016   \n",
       "130035  \"Effective at preventing pregnancy. My skin wa...  October 14, 2013   \n",
       "\n",
       "        usefulCount  \n",
       "3969             48  \n",
       "4926             33  \n",
       "80152             5  \n",
       "79507             6  \n",
       "130035           19  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from lale.datasets.uci.uci_datasets import fetch_drugscom\n",
    "from sklearn.model_selection import train_test_split\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import lale.schema2enums\n",
    "train_X_all, train_y_all, test_X, test_y = fetch_drugscom()\n",
    "#subset 3% = 4,838 rows to speed up experimentation\n",
    "train_X, train_X_ignore, train_y, train_y_ignore = train_test_split(\n",
    "    train_X_all, train_y_all, train_size=0.03, random_state=42)\n",
    "print(f'shapes: train_X {train_X.shape}, test_X {test_X.shape}')\n",
    "train_X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"332pt\" height=\"87pt\"\n",
       " viewBox=\"0.00 0.00 332.00 87.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 83)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-83 328,-83 328,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns=[&#39;review&#39;])\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"27\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf -->\n",
       "<g id=\"node2\" class=\"node\"><title>tfidf</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\" xlink:title=\"tfidf = Tfidf(max_features=100)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"117\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Tfidf</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;tfidf -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;tfidf</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-61C62.3932,-61 71.3106,-61 79.8241,-61\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-64.5001 89.919,-61 79.919,-57.5001 79.919,-64.5001\"/>\n",
       "</g>\n",
       "<!-- cat -->\n",
       "<g id=\"node4\" class=\"node\"><title>cat</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"cat = Cat\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"207\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"207\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">Cat</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf&#45;&gt;cat -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>tfidf&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.513,-54.8718C151.496,-52.6259 161.838,-50.0406 171.531,-47.6173\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"172.461,-50.9926 181.313,-45.1717 170.763,-44.2016 172.461,-50.9926\"/>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node3\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"117\" cy=\"-18\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-15.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;cat -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>project_1&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.982,-23.9616C151.789,-26.0632 161.854,-28.4651 171.317,-30.7233\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"170.721,-34.1793 181.26,-33.0962 172.346,-27.3705 170.721,-34.1793\"/>\n",
       "</g>\n",
       "<!-- lin_r -->\n",
       "<g id=\"node5\" class=\"node\"><title>lin_r</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html\" xlink:title=\"lin_r = LinR()\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"297\" cy=\"-39\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-42.2\" font-family=\"Times,serif\" font-size=\"11.00\">Lin&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-30.2\" font-family=\"Times,serif\" font-size=\"11.00\">R</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- cat&#45;&gt;lin_r -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>cat&#45;&gt;lin_r</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M234.403,-39C242.393,-39 251.311,-39 259.824,-39\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"259.919,-42.5001 269.919,-39 259.919,-35.5001 259.919,-42.5001\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fa34bee2f98>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from lale.lib.lale import Project\n",
    "from lale.lib.lale import ConcatFeatures as Cat\n",
    "from lale.lib.sklearn import TfidfVectorizer as Tfidf\n",
    "from lale.lib.sklearn import LinearRegression as LinR\n",
    "trainable = (\n",
    "        Project(columns=['review']) >> Tfidf(max_features=100)\n",
    "      & Project(columns={'type': 'number'})\n",
    "    ) >> Cat >> LinR()\n",
    "trainable.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"332pt\" height=\"87pt\"\n",
       " viewBox=\"0.00 0.00 332.00 87.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 83)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-83 328,-83 328,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns=[&#39;review&#39;])\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"27\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf -->\n",
       "<g id=\"node2\" class=\"node\"><title>tfidf</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\" xlink:title=\"tfidf = Tfidf(max_features=100)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Tfidf</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;tfidf -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;tfidf</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-61C62.3932,-61 71.3106,-61 79.8241,-61\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-64.5001 89.919,-61 79.919,-57.5001 79.919,-64.5001\"/>\n",
       "</g>\n",
       "<!-- cat -->\n",
       "<g id=\"node4\" class=\"node\"><title>cat</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"cat = Cat\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"207\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"207\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">Cat</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf&#45;&gt;cat -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>tfidf&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.513,-54.8718C151.496,-52.6259 161.838,-50.0406 171.531,-47.6173\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"172.461,-50.9926 181.313,-45.1717 170.763,-44.2016 172.461,-50.9926\"/>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node3\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-18\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-15.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;cat -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>project_1&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.982,-23.9616C151.789,-26.0632 161.854,-28.4651 171.317,-30.7233\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"170.721,-34.1793 181.26,-33.0962 172.346,-27.3705 170.721,-34.1793\"/>\n",
       "</g>\n",
       "<!-- lin_r -->\n",
       "<g id=\"node5\" class=\"node\"><title>lin_r</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html\" xlink:title=\"lin_r = LinR()\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"297\" cy=\"-39\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-42.2\" font-family=\"Times,serif\" font-size=\"11.00\">Lin&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-30.2\" font-family=\"Times,serif\" font-size=\"11.00\">R</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- cat&#45;&gt;lin_r -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>cat&#45;&gt;lin_r</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M234.403,-39C242.393,-39 251.311,-39 259.824,-39\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"259.919,-42.5001 269.919,-39 259.919,-35.5001 259.919,-42.5001\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fa34a5b5f28>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "trained = trainable.fit(train_X, train_y)\n",
    "predicted = trained.predict(test_X)\n",
    "trained.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "comparisons [10:7.8, 8:9.1, 9:5.7, 9:7.4, 9:5.8, 4:7.6, 6:5.4, 9:8.4, 7:5.9, 2:6.5]\n",
      "RMSE 3.00\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "from sklearn.metrics import mean_squared_error\n",
    "comparisons = [f'{int(test_y[i])}:{predicted[i]:.1f}' for i in range(10)]\n",
    "print(f'comparisons [{\", \".join(comparisons)}]')\n",
    "rmse = math.sqrt(mean_squared_error(predicted, test_y))\n",
    "print(f'RMSE {rmse:.2f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"332pt\" height=\"87pt\"\n",
       " viewBox=\"0.00 0.00 332.00 87.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 83)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-83 328,-83 328,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns=[&#39;review&#39;])\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"27\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf -->\n",
       "<g id=\"node2\" class=\"node\"><title>tfidf</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\" xlink:title=\"tfidf = Tfidf(max_features=100)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"117\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Tfidf</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;tfidf -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;tfidf</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-61C62.3932,-61 71.3106,-61 79.8241,-61\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-64.5001 89.919,-61 79.919,-57.5001 79.919,-64.5001\"/>\n",
       "</g>\n",
       "<!-- cat -->\n",
       "<g id=\"node4\" class=\"node\"><title>cat</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"cat = Cat\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"207\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"207\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">Cat</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf&#45;&gt;cat -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>tfidf&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.513,-54.8718C151.496,-52.6259 161.838,-50.0406 171.531,-47.6173\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"172.461,-50.9926 181.313,-45.1717 170.763,-44.2016 172.461,-50.9926\"/>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node3\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"117\" cy=\"-18\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-15.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;cat -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>project_1&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.982,-23.9616C151.789,-26.0632 161.854,-28.4651 171.317,-30.7233\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"170.721,-34.1793 181.26,-33.0962 172.346,-27.3705 170.721,-34.1793\"/>\n",
       "</g>\n",
       "<!-- xgb -->\n",
       "<g id=\"node5\" class=\"node\"><title>xgb</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn\" xlink:title=\"xgb = XGB()\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"297\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">XGB</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- cat&#45;&gt;xgb -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>cat&#45;&gt;xgb</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M234.403,-39C242.393,-39 251.311,-39 259.824,-39\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"259.919,-42.5001 269.919,-39 259.919,-35.5001 259.919,-42.5001\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fa34a4002b0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from lale.lib.xgboost import XGBRegressor as XGB\n",
    "trainable = (\n",
    "        Project(columns=['review']) >> Tfidf(max_features=100)\n",
    "      & Project(columns={'type': 'number'})\n",
    "    ) >> Cat >> XGB()\n",
    "trainable.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "comparisons [10:8.2, 8:7.8, 9:4.8, 9:7.4, 9:4.7, 4:7.1, 6:4.1, 9:7.4, 7:6.8, 2:6.2]\n",
      "RMSE 2.98\n"
     ]
    }
   ],
   "source": [
    "trained = trainable.fit(train_X, train_y)\n",
    "predicted = trained.predict(test_X)\n",
    "comparisons = [f'{int(test_y[i])}:{predicted[i]:.1f}' for i in range(10)]\n",
    "print(f'comparisons [{\", \".join(comparisons)}]')\n",
    "rmse = math.sqrt(mean_squared_error(predicted, test_y))\n",
    "print(f'RMSE {rmse:.2f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"332pt\" height=\"87pt\"\n",
       " viewBox=\"0.00 0.00 332.00 87.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 83)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-83 328,-83 328,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns=[&#39;review&#39;])\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"27\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf -->\n",
       "<g id=\"node2\" class=\"node\"><title>tfidf</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\" xlink:title=\"tfidf = Tfidf(ngram_range=(1, 1), max_features=100)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"117\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Tfidf</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;tfidf -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;tfidf</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-61C62.3932,-61 71.3106,-61 79.8241,-61\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-64.5001 89.919,-61 79.919,-57.5001 79.919,-64.5001\"/>\n",
       "</g>\n",
       "<!-- cat -->\n",
       "<g id=\"node4\" class=\"node\"><title>cat</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"cat = Cat\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"207\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"207\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">Cat</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf&#45;&gt;cat -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>tfidf&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.513,-54.8718C151.496,-52.6259 161.838,-50.0406 171.531,-47.6173\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"172.461,-50.9926 181.313,-45.1717 170.763,-44.2016 172.461,-50.9926\"/>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node3\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"117\" cy=\"-18\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-15.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;cat -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>project_1&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.982,-23.9616C151.789,-26.0632 161.854,-28.4651 171.317,-30.7233\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"170.721,-34.1793 181.26,-33.0962 172.346,-27.3705 170.721,-34.1793\"/>\n",
       "</g>\n",
       "<!-- forest -->\n",
       "<g id=\"node5\" class=\"node\"><title>forest</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html\" xlink:title=\"forest = Forest\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"297\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">Forest</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- cat&#45;&gt;forest -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>cat&#45;&gt;forest</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M234.403,-39C242.393,-39 251.311,-39 259.824,-39\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"259.919,-42.5001 269.919,-39 259.919,-35.5001 259.919,-42.5001\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fa3d40cbb70>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from lale.lib.sklearn import RandomForestRegressor as Forest\n",
    "tfidf_hps = {**Tfidf.get_defaults(), 'max_features': 100, 'ngram_range': (1,1)}\n",
    "planned = (\n",
    "        Project(columns=['review']) >> Tfidf(**tfidf_hps)\n",
    "      & Project(columns={'type': 'number'})\n",
    "    ) >> Cat >> Forest\n",
    "planned.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100%|███████████| 3/3 [01:32<00:00, 29.61s/it, best loss: -0.10521892976978495]\n",
      "RMSE 3.12\n"
     ]
    },
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"332pt\" height=\"87pt\"\n",
       " viewBox=\"0.00 0.00 332.00 87.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 83)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-83 328,-83 328,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns=[&#39;review&#39;])\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"27\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf -->\n",
       "<g id=\"node2\" class=\"node\"><title>tfidf</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\" xlink:title=\"tfidf = Tfidf(ngram_range=(1, 1), max_features=100)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Tfidf</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;tfidf -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;tfidf</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-61C62.3932,-61 71.3106,-61 79.8241,-61\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-64.5001 89.919,-61 79.919,-57.5001 79.919,-64.5001\"/>\n",
       "</g>\n",
       "<!-- cat -->\n",
       "<g id=\"node4\" class=\"node\"><title>cat</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"cat = Cat()\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"207\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"207\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">Cat</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- tfidf&#45;&gt;cat -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>tfidf&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.513,-54.8718C151.496,-52.6259 161.838,-50.0406 171.531,-47.6173\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"172.461,-50.9926 181.313,-45.1717 170.763,-44.2016 172.461,-50.9926\"/>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node3\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-18\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-15.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;cat -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>project_1&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.982,-23.9616C151.789,-26.0632 161.854,-28.4651 171.317,-30.7233\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"170.721,-34.1793 181.26,-33.0962 172.346,-27.3705 170.721,-34.1793\"/>\n",
       "</g>\n",
       "<!-- forest -->\n",
       "<g id=\"node5\" class=\"node\"><title>forest</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html\" xlink:title=\"forest = Forest(bootstrap=False, max_depth=4, max_features=0.25040322158625117, min_samples_leaf=0.05283026007212067, min_samples_split=13, n_estimators=23)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"297\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">Forest</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- cat&#45;&gt;forest -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>cat&#45;&gt;forest</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M234.403,-39C242.393,-39 251.311,-39 259.824,-39\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"259.919,-42.5001 269.919,-39 259.919,-35.5001 259.919,-42.5001\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fa34a4cfa20>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from lale.lib.lale import Hyperopt\n",
    "\n",
    "# Search the hyperparameters of `planned` (built in an earlier cell) with\n",
    "# Hyperopt. max_evals=3 keeps the run short. Note that the search itself is\n",
    "# scored with R^2, while the figure reported below is RMSE on the test set.\n",
    "best_estimator = planned.auto_configure(\n",
    "    train_X, train_y, optimizer=Hyperopt, max_evals=3, scoring='r2'\n",
    ")\n",
    "\n",
    "# scikit-learn metrics take (y_true, y_pred): ground truth goes first.\n",
    "# `math` and `mean_squared_error` are expected to be imported by an earlier cell.\n",
    "predicted = best_estimator.predict(test_X)\n",
    "rmse = math.sqrt(mean_squared_error(test_y, predicted))\n",
    "print(f'RMSE {rmse:.2f}')\n",
    "\n",
    "# Last expression of the cell: render the selected pipeline as a graph.\n",
    "best_estimator.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"332pt\" height=\"87pt\"\n",
       " viewBox=\"0.00 0.00 332.00 87.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 83)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-83 328,-83 328,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns=[&#39;review&#39;])\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"27\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- bert -->\n",
       "<g id=\"node2\" class=\"node\"><title>bert</title>\n",
       "<g id=\"a_node2\"><a xlink:title=\"bert = Bert(batch_size=126)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-61\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-58.2\" font-family=\"Times,serif\" font-size=\"11.00\">Bert</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;bert -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;bert</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-61C62.3932,-61 71.3106,-61 79.8241,-61\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"79.919,-64.5001 89.919,-61 79.919,-57.5001 79.919,-64.5001\"/>\n",
       "</g>\n",
       "<!-- cat -->\n",
       "<g id=\"node4\" class=\"node\"><title>cat</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"cat = Cat\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"207\" cy=\"-39\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"207\" y=\"-36.2\" font-family=\"Times,serif\" font-size=\"11.00\">Cat</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- bert&#45;&gt;cat -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>bert&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.513,-54.8718C151.496,-52.6259 161.838,-50.0406 171.531,-47.6173\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"172.461,-50.9926 181.313,-45.1717 170.763,-44.2016 172.461,-50.9926\"/>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node3\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"117\" cy=\"-18\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-15.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;cat -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>project_1&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M142.982,-23.9616C151.789,-26.0632 161.854,-28.4651 171.317,-30.7233\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"170.721,-34.1793 181.26,-33.0962 172.346,-27.3705 170.721,-34.1793\"/>\n",
       "</g>\n",
       "<!-- lin_r -->\n",
       "<g id=\"node5\" class=\"node\"><title>lin_r</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html\" xlink:title=\"lin_r = LinR()\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"297\" cy=\"-39\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-42.2\" font-family=\"Times,serif\" font-size=\"11.00\">Lin&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-30.2\" font-family=\"Times,serif\" font-size=\"11.00\">R</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- cat&#45;&gt;lin_r -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>cat&#45;&gt;lin_r</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M234.403,-39C242.393,-39 251.311,-39 259.824,-39\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"259.919,-42.5001 269.919,-39 259.919,-35.5001 259.919,-42.5001\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fa3346e09e8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from lale.lib.lale import Project, ConcatFeatures as Cat\n",
    "from lale.lib.pytorch import BertPretrainedEncoder as Bert\n",
    "from lale.lib.sklearn import LinearRegression as LinR\n",
    "\n",
    "# Two feature branches are combined with `&`, concatenated by Cat, and fed\n",
    "# to a linear regressor:\n",
    "#   1. the 'review' column, projected out and piped into Bert\n",
    "#   2. the columns whose schema type is 'number'\n",
    "# `>>` binds tighter than `&`, so the inner parentheses only make explicit\n",
    "# the grouping Python already applies.\n",
    "trainable = (\n",
    "    (Project(columns=['review']) >> Bert(batch_size=126))\n",
    "    & Project(columns={'type': 'number'})\n",
    ") >> Cat >> LinR()\n",
    "\n",
    "trainable.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"348pt\" height=\"192pt\"\n",
       " viewBox=\"0.00 0.00 348.00 192.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 188)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-188 344,-188 344,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<g id=\"clust1\" class=\"cluster\"><title>cluster:choice_0</title>\n",
       "<g id=\"a_clust1\"><a xlink:title=\"choice_0 = bert | tfidf\">\n",
       "<polygon fill=\"#7ec0ee\" stroke=\"black\" points=\"82,-58 82,-176 152,-176 152,-58 82,-58\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-160.8\" font-family=\"Times,serif\" font-size=\"14.00\">Choice</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<g id=\"clust2\" class=\"cluster\"><title>cluster:choice_1</title>\n",
       "<g id=\"a_clust2\"><a xlink:title=\"choice_1 = lin_r | xgb\">\n",
       "<polygon fill=\"#7ec0ee\" stroke=\"black\" points=\"262,-8 262,-130 332,-130 332,-8 262,-8\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-114.8\" font-family=\"Times,serif\" font-size=\"14.00\">Choice</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0 -->\n",
       "<g id=\"node1\" class=\"node\"><title>project_0</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_0 = Project(columns=[&#39;review&#39;])\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"27\" cy=\"-127\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"27\" y=\"-124.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- bert -->\n",
       "<g id=\"node2\" class=\"node\"><title>bert</title>\n",
       "<g id=\"a_node2\"><a xlink:title=\"bert = Bert\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"117\" cy=\"-127\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-124.2\" font-family=\"Times,serif\" font-size=\"11.00\">Bert</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_0&#45;&gt;bert -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>project_0&#45;&gt;bert</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M54.4029,-127C62.3932,-127 71.3106,-127 79.8241,-127\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"72.0002,-130.5 82,-127 71.9998,-123.5 72.0002,-130.5\"/>\n",
       "</g>\n",
       "<!-- cat -->\n",
       "<g id=\"node5\" class=\"node\"><title>cat</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"cat = Cat\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"207\" cy=\"-79\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"207\" y=\"-76.2\" font-family=\"Times,serif\" font-size=\"11.00\">Cat</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- bert&#45;&gt;cat -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>bert&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M151.688,-108.625C160.004,-104.089 168.841,-99.2684 176.918,-94.863\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"178.598,-97.9337 185.701,-90.0724 175.246,-91.7884 178.598,-97.9337\"/>\n",
       "</g>\n",
       "<!-- tfidf -->\n",
       "<g id=\"node3\" class=\"node\"><title>tfidf</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\" xlink:title=\"tfidf = Tfidf\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"117\" cy=\"-84\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-81.2\" font-family=\"Times,serif\" font-size=\"11.00\">Tfidf</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1 -->\n",
       "<g id=\"node4\" class=\"node\"><title>project_1</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.project.html\" xlink:title=\"project_1 = Project(columns={&#39;type&#39;: &#39;number&#39;})\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"117\" cy=\"-32\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"117\" y=\"-29.2\" font-family=\"Times,serif\" font-size=\"11.00\">Project</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- project_1&#45;&gt;cat -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>project_1&#45;&gt;cat</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M138.855,-43.1383C150.033,-49.1087 163.998,-56.5671 176.293,-63.1335\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"174.825,-66.3177 185.295,-67.9416 178.123,-60.1432 174.825,-66.3177\"/>\n",
       "</g>\n",
       "<!-- lin_r -->\n",
       "<g id=\"node6\" class=\"node\"><title>lin_r</title>\n",
       "<g id=\"a_node6\"><a xlink:href=\"https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html\" xlink:title=\"lin_r = LinR\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"297\" cy=\"-79\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-82.2\" font-family=\"Times,serif\" font-size=\"11.00\">Lin&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-70.2\" font-family=\"Times,serif\" font-size=\"11.00\">R</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- cat&#45;&gt;lin_r -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>cat&#45;&gt;lin_r</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M234.403,-79C242.393,-79 251.311,-79 259.824,-79\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"252,-82.5005 262,-79 252,-75.5005 252,-82.5005\"/>\n",
       "</g>\n",
       "<!-- xgb -->\n",
       "<g id=\"node7\" class=\"node\"><title>xgb</title>\n",
       "<g id=\"a_node7\"><a xlink:href=\"https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn\" xlink:title=\"xgb = XGB\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"297\" cy=\"-34\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"297\" y=\"-31.2\" font-family=\"Times,serif\" font-size=\"11.00\">XGB</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7fa336a8b978>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from lale.lib.lale import Project, ConcatFeatures as Cat\n",
    "from lale.lib.pytorch import BertPretrainedEncoder as Bert\n",
    "from lale.lib.sklearn import LinearRegression as LinR, TfidfVectorizer as Tfidf\n",
    "from lale.lib.xgboost import XGBRegressor as XGB\n",
    "\n",
    "# Same two-branch topology as the trainable pipeline, but with the concrete\n",
    "# operators replaced by `|` choices and left unconfigured (no call\n",
    "# parentheses):\n",
    "#   - text branch: 'review' column -> Bert or Tfidf\n",
    "#   - numeric branch: columns whose schema type is 'number'\n",
    "#   - final estimator: LinR or XGB\n",
    "# `>>` binds tighter than `&`; the inner parentheses just make that explicit.\n",
    "planned = (\n",
    "    (Project(columns=['review']) >> (Bert | Tfidf))\n",
    "    & Project(columns={'type': 'number'})\n",
    ") >> Cat >> (LinR | XGB)\n",
    "\n",
    "planned.visualize()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}