{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Important Disclaimer:** This notebook is a mockup. So far, the example here does not support\n",
    "`fit` or `predict`, let alone hyperparameter tuning etc.\n",
    "\n",
    "The pipeline shown here assumes the example input tables from\n",
    "<a href=\"https://arxiv.org/pdf/1706.00327.pdf#page=3\">Fig. 2</a>\n",
    "of the following paper:\n",
    "Hoang Thanh Lam, Johann-Michael Thiebaut, Mathieu Sinn, Bei Chen, Tiep Mai, and Oznur Alkan.\n",
    "\"One button machine for automating feature engineering in relational databases\". 2017.\n",
    "https://arxiv.org/abs/1706.00327"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from lale.expressions import it, replace, sum, max, count, month, day_of_month, item\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from lale.lib.lale import Scan, Join, Map, GroupBy, Aggregate, ConcatFeatures\n",
    "from sklearn.feature_selection import SelectKBest as SelectFeatures\n",
    "from sklearn.pipeline import Pipeline\n",
    "from lale.lib.autoai_libs import NumpyColumnSelector, CatEncoder, OptStandardScaler, FS1\n",
    "from sklearn.linear_model import LogisticRegression as LR\n",
    "from sklearn.neighbors import KNeighborsClassifier as KNN\n",
    "from xgboost import XGBClassifier as XGBoost\n",
    "import lale\n",
    "lale.wrap_imported_operators()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one-to-one path (main -> info) doesn't need GroupBy >> Aggregate\n",
    "info_features = (\n",
    "       (Scan(table=it.main) & Scan(table=it.info))\n",
    "    >> Join(pred=[it.main.TrainId == it.info.Train_Id, #note the underscore\n",
    "                  it.main['Arrival time'] >= it.info.TimeStamp])\n",
    "    >> Map(columns=[replace(it['Train class'], {'Regional': 0, 'Intercity': 1}),\n",
    "                    it['Max Speed (km/h)'],\n",
    "                    month(it['Arrival time'], fmt='YYYY-MM-DD HH:MM:SS'),\n",
    "                    day_of_month(it['Arrival time'])]))\n",
    "# one-to-many path (multiple delay rows per main-table row)\n",
    "delay_features = (\n",
    "       (Scan(table=it.main) & Scan(table=it.delay))\n",
    "    >> Join(pred=[it.main.TrainId == it.delay.TrainId,\n",
    "                  it.main['Arrival time'] >= it.delay.TimeStamp])\n",
    "    >> GroupBy(key=it.MessageId) #primary key of main table\n",
    "    >> Aggregate(columns=[sum(it.Delay), max(it.Delay)]))\n",
    "# multi-hop one-to-many path uses multi-way join\n",
    "event_features = (\n",
    "       (Scan(table=it.main) & Scan(table=it.delay) & Scan(table=it.event))\n",
    "    >> Join(pred=[it.main.TrainId == it.delay.TrainId,\n",
    "                  it.main['Arrival time'] >= it.delay.TimeStamp,\n",
    "                  it.delay.StationId == it.event.StationId,\n",
    "                  it.main.TimeStamp >= it.event.TimeStamp])\n",
    "    >> GroupBy(key=it.MessageId) #primary key of main table\n",
    "    >> Aggregate(columns=[count(it.Event),\n",
    "                          item(it['Train class'], 'Roadwork')]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_features = Pipeline(steps=[('data_joins',\n",
    "       (info_features & delay_features & event_features)\n",
    "    >> ConcatFeatures >> SelectFeatures())])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "cats_prep = NumpyColumnSelector(columns=[0]) >> CatEncoder(dtype=np.float64)\n",
    "cont_prep = NumpyColumnSelector(columns=[1,2]) >> OptStandardScaler(use_scaler_flag=True)\n",
    "all_prep = Pipeline(steps=[('preprocessing',\n",
    "    (cats_prep & cont_prep) >> ConcatFeatures >> FS1(additional_col_count_to_keep=3))])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier = LR | KNN | XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = all_features >> all_prep >> classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.38.0 (20140413.2041)\n",
       " -->\n",
       "<!-- Title: cluster:(root) Pages: 1 -->\n",
       "<svg width=\"1091pt\" height=\"329pt\"\n",
       " viewBox=\"0.00 0.00 1090.51 329.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 325)\">\n",
       "<title>cluster:(root)</title>\n",
       "<g id=\"a_graph0\"><a xlink:title=\"(root) = ...\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-325 1086.51,-325 1086.51,4 -4,4\"/>\n",
       "</a>\n",
       "</g>\n",
       "<g id=\"clust1\" class=\"cluster\"><title>cluster:pipeline_1</title>\n",
       "<g id=\"a_clust1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pipeline.html\" xlink:title=\"pipeline_1 = Pipeline(steps=[(&#39;data_joins&#39;, pipeline_2)])\">\n",
       "<polygon fill=\"#b0e2ff\" stroke=\"black\" points=\"8,-30 8,-313 585.453,-313 585.453,-30 8,-30\"/>\n",
       "<text text-anchor=\"middle\" x=\"296.727\" y=\"-297.8\" font-family=\"Times,serif\" font-size=\"14.00\">Pipeline: data_joins</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<g id=\"clust2\" class=\"cluster\"><title>cluster:pipeline_2</title>\n",
       "<g id=\"a_clust2\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pipeline.html\" xlink:title=\"pipeline_2 = ...\">\n",
       "<path fill=\"#b0e2ff\" stroke=\"black\" d=\"M28,-38C28,-38 565.453,-38 565.453,-38 571.453,-38 577.453,-44 577.453,-50 577.453,-50 577.453,-270 577.453,-270 577.453,-276 571.453,-282 565.453,-282 565.453,-282 28,-282 28,-282 22,-282 16,-276 16,-270 16,-270 16,-50 16,-50 16,-44 22,-38 28,-38\"/>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<g id=\"clust3\" class=\"cluster\"><title>cluster:pipeline_3</title>\n",
       "<g id=\"a_clust3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pipeline.html\" xlink:title=\"pipeline_3 = Pipeline(steps=[(&#39;preprocessing&#39;, pipeline_4)])\">\n",
       "<polygon fill=\"#b0e2ff\" stroke=\"black\" points=\"593.453,-54 593.453,-229 992.514,-229 992.514,-54 593.453,-54\"/>\n",
       "<text text-anchor=\"middle\" x=\"792.984\" y=\"-213.8\" font-family=\"Times,serif\" font-size=\"14.00\">Pipeline: preprocessing</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<g id=\"clust4\" class=\"cluster\"><title>cluster:pipeline_4</title>\n",
       "<g id=\"a_clust4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pipeline.html\" xlink:title=\"pipeline_4 = ...\">\n",
       "<path fill=\"#b0e2ff\" stroke=\"black\" d=\"M613.453,-62C613.453,-62 972.514,-62 972.514,-62 978.514,-62 984.514,-68 984.514,-74 984.514,-74 984.514,-186 984.514,-186 984.514,-192 978.514,-198 972.514,-198 972.514,-198 613.453,-198 613.453,-198 607.453,-198 601.453,-192 601.453,-186 601.453,-186 601.453,-74 601.453,-74 601.453,-68 607.453,-62 613.453,-62\"/>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<g id=\"clust5\" class=\"cluster\"><title>cluster:choice</title>\n",
       "<g id=\"a_clust5\"><a xlink:title=\"choice = lr | knn | xg_boost\">\n",
       "<polygon fill=\"#7ec0ee\" stroke=\"black\" points=\"1004.51,-8 1004.51,-173 1074.51,-173 1074.51,-8 1004.51,-8\"/>\n",
       "<text text-anchor=\"middle\" x=\"1039.51\" y=\"-157.8\" font-family=\"Times,serif\" font-size=\"14.00\">Choice</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- scan -->\n",
       "<g id=\"node1\" class=\"node\"><title>scan</title>\n",
       "<g id=\"a_node1\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.scan.html\" xlink:title=\"scan = Scan(table=it.main)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"146.974\" cy=\"-254\" rx=\"27.6545\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"146.974\" y=\"-257.2\" font-family=\"Times,serif\" font-size=\"11.00\">Scan:</text>\n",
       "<text text-anchor=\"middle\" x=\"146.974\" y=\"-245.2\" font-family=\"Times,serif\" font-size=\"11.00\">it.main</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- join -->\n",
       "<g id=\"node3\" class=\"node\"><title>join</title>\n",
       "<g id=\"a_node3\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.join.html\" xlink:title=\"join = Join(pred=[(it.main.TrainId == it.info.Train_Id), (it.main[&#39;Arrival time&#39;] &gt;= it.info.TimeStamp)])\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"238.836\" cy=\"-207\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"238.836\" y=\"-204.2\" font-family=\"Times,serif\" font-size=\"11.00\">Join</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- scan&#45;&gt;join -->\n",
       "<g id=\"edge1\" class=\"edge\"><title>scan&#45;&gt;join</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M169.727,-242.623C181.234,-236.604 195.534,-229.125 208.056,-222.576\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"209.97,-225.524 217.209,-217.788 206.726,-219.322 209.97,-225.524\"/>\n",
       "</g>\n",
       "<!-- scan_0 -->\n",
       "<g id=\"node2\" class=\"node\"><title>scan_0</title>\n",
       "<g id=\"a_node2\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.scan.html\" xlink:title=\"scan_0 = Scan(table=it.info)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"146.974\" cy=\"-207\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"146.974\" y=\"-210.2\" font-family=\"Times,serif\" font-size=\"11.00\">Scan:</text>\n",
       "<text text-anchor=\"middle\" x=\"146.974\" y=\"-198.2\" font-family=\"Times,serif\" font-size=\"11.00\">it.info</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- scan_0&#45;&gt;join -->\n",
       "<g id=\"edge2\" class=\"edge\"><title>scan_0&#45;&gt;join</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M174.449,-207C182.859,-207 192.313,-207 201.301,-207\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"201.549,-210.5 211.549,-207 201.549,-203.5 201.549,-210.5\"/>\n",
       "</g>\n",
       "<!-- map -->\n",
       "<g id=\"node4\" class=\"node\"><title>map</title>\n",
       "<g id=\"a_node4\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.map.html\" xlink:title=\"map = Map(columns=[replace(\n",
       "{&#39;Intercity&#39;: 1, &#39;Regional&#39;: 0}), it[&#39;Max Speed (km/h)&#39;], month(it[&#39;Arrival time&#39;], &#39;YYYY&#45;MM&#45;DD HH:MM:SS&#39;), day_of_month(it[&#39;Arrival time&#39;])])\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"333.818\" cy=\"-206\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"333.818\" y=\"-203.2\" font-family=\"Times,serif\" font-size=\"11.00\">Map</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- join&#45;&gt;map -->\n",
       "<g id=\"edge3\" class=\"edge\"><title>join&#45;&gt;map</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M266.232,-206.716C275.636,-206.615 286.399,-206.499 296.49,-206.391\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"296.716,-209.889 306.678,-206.281 296.641,-202.889 296.716,-209.889\"/>\n",
       "</g>\n",
       "<!-- concat_features -->\n",
       "<g id=\"node16\" class=\"node\"><title>concat_features</title>\n",
       "<g id=\"a_node16\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"concat_features = ConcatFeatures\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"433.751\" cy=\"-162\" rx=\"33.4697\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"433.751\" y=\"-165.2\" font-family=\"Times,serif\" font-size=\"11.00\">Concat&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"433.751\" y=\"-153.2\" font-family=\"Times,serif\" font-size=\"11.00\">Features</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- map&#45;&gt;concat_features -->\n",
       "<g id=\"edge13\" class=\"edge\"><title>map&#45;&gt;concat_features</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M356.553,-196.235C368.549,-190.845 383.741,-184.019 397.399,-177.883\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"398.864,-181.062 406.551,-173.771 395.995,-174.677 398.864,-181.062\"/>\n",
       "</g>\n",
       "<!-- scan_1 -->\n",
       "<g id=\"node5\" class=\"node\"><title>scan_1</title>\n",
       "<g id=\"a_node5\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.scan.html\" xlink:title=\"scan_1 = Scan(table=it.main)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"53.6985\" cy=\"-254\" rx=\"27.6545\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-257.2\" font-family=\"Times,serif\" font-size=\"11.00\">Scan:</text>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-245.2\" font-family=\"Times,serif\" font-size=\"11.00\">it.main</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- join_0 -->\n",
       "<g id=\"node7\" class=\"node\"><title>join_0</title>\n",
       "<g id=\"a_node7\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.join.html\" xlink:title=\"join_0 = Join(pred=[(it.main.TrainId == it.delay.TrainId), (it.main[&#39;Arrival time&#39;] &gt;= it.delay.TimeStamp)])\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"146.974\" cy=\"-162\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"146.974\" y=\"-159.2\" font-family=\"Times,serif\" font-size=\"11.00\">Join</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- scan_1&#45;&gt;join_0 -->\n",
       "<g id=\"edge4\" class=\"edge\"><title>scan_1&#45;&gt;join_0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M73.1251,-239.575C76.656,-236.541 80.2347,-233.267 83.397,-230 101.454,-211.348 101.293,-202.607 119.397,-184 119.93,-183.453 120.475,-182.905 121.031,-182.359\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"123.586,-184.766 128.622,-175.445 118.872,-179.591 123.586,-184.766\"/>\n",
       "</g>\n",
       "<!-- scan_2 -->\n",
       "<g id=\"node6\" class=\"node\"><title>scan_2</title>\n",
       "<g id=\"a_node6\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.scan.html\" xlink:title=\"scan_2 = Scan(table=it.delay)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"53.6985\" cy=\"-207\" rx=\"29.8983\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-210.2\" font-family=\"Times,serif\" font-size=\"11.00\">Scan:</text>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-198.2\" font-family=\"Times,serif\" font-size=\"11.00\">it.delay</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- scan_2&#45;&gt;join_0 -->\n",
       "<g id=\"edge5\" class=\"edge\"><title>scan_2&#45;&gt;join_0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M78.2042,-195.411C89.5409,-189.822 103.282,-183.048 115.419,-177.064\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"117.311,-180.034 124.732,-172.472 114.215,-173.755 117.311,-180.034\"/>\n",
       "</g>\n",
       "<!-- group_by -->\n",
       "<g id=\"node8\" class=\"node\"><title>group_by</title>\n",
       "<g id=\"a_node8\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.join.html\" xlink:title=\"group_by = GroupBy(key=it.MessageId)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"238.836\" cy=\"-162\" rx=\"28.0702\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"238.836\" y=\"-165.2\" font-family=\"Times,serif\" font-size=\"11.00\">Group&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"238.836\" y=\"-153.2\" font-family=\"Times,serif\" font-size=\"11.00\">By</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- join_0&#45;&gt;group_by -->\n",
       "<g id=\"edge6\" class=\"edge\"><title>join_0&#45;&gt;group_by</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M174.449,-162C182.473,-162 191.447,-162 200.061,-162\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"200.302,-165.5 210.302,-162 200.302,-158.5 200.302,-165.5\"/>\n",
       "</g>\n",
       "<!-- aggregate -->\n",
       "<g id=\"node9\" class=\"node\"><title>aggregate</title>\n",
       "<g id=\"a_node9\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.aggregate.html\" xlink:title=\"aggregate = Aggregate(columns=[sum(it.Delay), max(it.Delay)])\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"333.818\" cy=\"-162\" rx=\"30.8985\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"333.818\" y=\"-159.2\" font-family=\"Times,serif\" font-size=\"11.00\">Aggregate</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- group_by&#45;&gt;aggregate -->\n",
       "<g id=\"edge7\" class=\"edge\"><title>group_by&#45;&gt;aggregate</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M267.231,-162C275.228,-162 284.139,-162 292.749,-162\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"293.022,-165.5 303.022,-162 293.022,-158.5 293.022,-165.5\"/>\n",
       "</g>\n",
       "<!-- aggregate&#45;&gt;concat_features -->\n",
       "<g id=\"edge14\" class=\"edge\"><title>aggregate&#45;&gt;concat_features</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M364.734,-162C372.732,-162 381.533,-162 390.075,-162\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"390.294,-165.5 400.294,-162 390.294,-158.5 390.294,-165.5\"/>\n",
       "</g>\n",
       "<!-- scan_3 -->\n",
       "<g id=\"node10\" class=\"node\"><title>scan_3</title>\n",
       "<g id=\"a_node10\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.scan.html\" xlink:title=\"scan_3 = Scan(table=it.main)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"53.6985\" cy=\"-160\" rx=\"27.6545\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-163.2\" font-family=\"Times,serif\" font-size=\"11.00\">Scan:</text>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-151.2\" font-family=\"Times,serif\" font-size=\"11.00\">it.main</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- join_1 -->\n",
       "<g id=\"node13\" class=\"node\"><title>join_1</title>\n",
       "<g id=\"a_node13\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.join.html\" xlink:title=\"join_1 = Join(pred=[(it.main.TrainId == it.delay.TrainId), (it.main[&#39;Arrival time&#39;] &gt;= it.delay.TimeStamp), (it.delay.StationId == it.event.StationId), (it.main.TimeStamp &gt;= it.event.TimeStamp)])\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"146.974\" cy=\"-114\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"146.974\" y=\"-111.2\" font-family=\"Times,serif\" font-size=\"11.00\">Join</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- scan_3&#45;&gt;join_1 -->\n",
       "<g id=\"edge8\" class=\"edge\"><title>scan_3&#45;&gt;join_1</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M76.7938,-148.865C88.4818,-142.974 103.007,-135.654 115.726,-129.244\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"117.668,-132.185 125.023,-124.559 114.518,-125.934 117.668,-132.185\"/>\n",
       "</g>\n",
       "<!-- scan_4 -->\n",
       "<g id=\"node11\" class=\"node\"><title>scan_4</title>\n",
       "<g id=\"a_node11\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.scan.html\" xlink:title=\"scan_4 = Scan(table=it.delay)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"53.6985\" cy=\"-113\" rx=\"29.8983\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-116.2\" font-family=\"Times,serif\" font-size=\"11.00\">Scan:</text>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-104.2\" font-family=\"Times,serif\" font-size=\"11.00\">it.delay</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- scan_4&#45;&gt;join_1 -->\n",
       "<g id=\"edge9\" class=\"edge\"><title>scan_4&#45;&gt;join_1</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M83.5787,-113.316C91.8218,-113.407 100.912,-113.506 109.544,-113.601\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"109.719,-117.103 119.757,-113.713 109.796,-110.103 109.719,-117.103\"/>\n",
       "</g>\n",
       "<!-- scan_5 -->\n",
       "<g id=\"node12\" class=\"node\"><title>scan_5</title>\n",
       "<g id=\"a_node12\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.scan.html\" xlink:title=\"scan_5 = Scan(table=it.event)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"53.6985\" cy=\"-66\" rx=\"29.8983\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-69.2\" font-family=\"Times,serif\" font-size=\"11.00\">Scan:</text>\n",
       "<text text-anchor=\"middle\" x=\"53.6985\" y=\"-57.2\" font-family=\"Times,serif\" font-size=\"11.00\">it.event</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- scan_5&#45;&gt;join_1 -->\n",
       "<g id=\"edge10\" class=\"edge\"><title>scan_5&#45;&gt;join_1</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M77.7312,-78.1124C89.3465,-84.2207 103.575,-91.7033 116.029,-98.2525\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"114.654,-101.484 125.134,-103.04 117.912,-95.2882 114.654,-101.484\"/>\n",
       "</g>\n",
       "<!-- group_by_0 -->\n",
       "<g id=\"node14\" class=\"node\"><title>group_by_0</title>\n",
       "<g id=\"a_node14\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.join.html\" xlink:title=\"group_by_0 = GroupBy(key=it.MessageId)\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"238.836\" cy=\"-115\" rx=\"28.0702\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"238.836\" y=\"-118.2\" font-family=\"Times,serif\" font-size=\"11.00\">Group&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"238.836\" y=\"-106.2\" font-family=\"Times,serif\" font-size=\"11.00\">By</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- join_1&#45;&gt;group_by_0 -->\n",
       "<g id=\"edge11\" class=\"edge\"><title>join_1&#45;&gt;group_by_0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M174.449,-114.295C182.473,-114.384 191.447,-114.484 200.061,-114.58\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"200.263,-118.082 210.302,-114.694 200.341,-111.082 200.263,-118.082\"/>\n",
       "</g>\n",
       "<!-- aggregate_0 -->\n",
       "<g id=\"node15\" class=\"node\"><title>aggregate_0</title>\n",
       "<g id=\"a_node15\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.aggregate.html\" xlink:title=\"aggregate_0 = Aggregate(columns=[count(it.Event), item(it[&#39;Train class&#39;], &#39;Roadwork&#39;)])\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"333.818\" cy=\"-117\" rx=\"30.8985\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"333.818\" y=\"-114.2\" font-family=\"Times,serif\" font-size=\"11.00\">Aggregate</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- group_by_0&#45;&gt;aggregate_0 -->\n",
       "<g id=\"edge12\" class=\"edge\"><title>group_by_0&#45;&gt;aggregate_0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M267.231,-115.589C275.228,-115.761 284.139,-115.953 292.749,-116.138\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"292.949,-119.643 303.022,-116.359 293.1,-112.645 292.949,-119.643\"/>\n",
       "</g>\n",
       "<!-- aggregate_0&#45;&gt;concat_features -->\n",
       "<g id=\"edge15\" class=\"edge\"><title>aggregate_0&#45;&gt;concat_features</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M358.525,-127.893C370.278,-133.293 384.703,-139.922 397.702,-145.895\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"396.308,-149.106 406.856,-150.101 399.231,-142.746 396.308,-149.106\"/>\n",
       "</g>\n",
       "<!-- select_features -->\n",
       "<g id=\"node17\" class=\"node\"><title>select_features</title>\n",
       "<g id=\"a_node17\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.select_k_best.html\" xlink:title=\"select_features = SelectFeatures()\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"536.219\" cy=\"-162\" rx=\"33.4697\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"536.219\" y=\"-165.2\" font-family=\"Times,serif\" font-size=\"11.00\">Select&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"536.219\" y=\"-153.2\" font-family=\"Times,serif\" font-size=\"11.00\">Features</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- concat_features&#45;&gt;select_features -->\n",
       "<g id=\"edge16\" class=\"edge\"><title>concat_features&#45;&gt;select_features</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M467.093,-162C475.249,-162 484.129,-162 492.702,-162\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"492.938,-165.5 502.938,-162 492.938,-158.5 492.938,-165.5\"/>\n",
       "</g>\n",
       "<!-- numpy_column_selector -->\n",
       "<g id=\"node18\" class=\"node\"><title>numpy_column_selector</title>\n",
       "<g id=\"a_node18\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.autoai_libs.numpy_column_selector.html\" xlink:title=\"numpy_column_selector = NumpyColumnSelector(columns=[0])\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"642.687\" cy=\"-162\" rx=\"33.4697\" ry=\"28.0702\"/>\n",
       "<text text-anchor=\"middle\" x=\"642.687\" y=\"-171.2\" font-family=\"Times,serif\" font-size=\"11.00\">Numpy&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"642.687\" y=\"-159.2\" font-family=\"Times,serif\" font-size=\"11.00\">Column&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"642.687\" y=\"-147.2\" font-family=\"Times,serif\" font-size=\"11.00\">Selector</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- select_features&#45;&gt;numpy_column_selector -->\n",
       "<g id=\"edge22\" class=\"edge\"><title>select_features&#45;&gt;numpy_column_selector</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M585.453,-162C587.496,-162 589.539,-162 591.582,-162\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"583.625,-165.5 593.624,-162 583.624,-158.5 583.625,-165.5\"/>\n",
       "</g>\n",
       "<!-- cat_encoder -->\n",
       "<g id=\"node19\" class=\"node\"><title>cat_encoder</title>\n",
       "<g id=\"a_node19\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.autoai_libs.cat_encoder.html\" xlink:title=\"cat_encoder = CatEncoder(encoding=&#39;ordinal&#39;, categories=&#39;auto&#39;, dtype=np.float64, handle_unknown=&#39;ignore&#39;)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"747.984\" cy=\"-159\" rx=\"31.6406\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"747.984\" y=\"-162.2\" font-family=\"Times,serif\" font-size=\"11.00\">Cat&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"747.984\" y=\"-150.2\" font-family=\"Times,serif\" font-size=\"11.00\">Encoder</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- numpy_column_selector&#45;&gt;cat_encoder -->\n",
       "<g id=\"edge17\" class=\"edge\"><title>numpy_column_selector&#45;&gt;cat_encoder</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M676.087,-161.059C685.573,-160.784 696.068,-160.479 706.012,-160.19\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"706.215,-163.686 716.109,-159.897 706.012,-156.689 706.215,-163.686\"/>\n",
       "</g>\n",
       "<!-- concat_features_0 -->\n",
       "<g id=\"node22\" class=\"node\"><title>concat_features_0</title>\n",
       "<g id=\"a_node22\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html\" xlink:title=\"concat_features_0 = ConcatFeatures\">\n",
       "<ellipse fill=\"white\" stroke=\"black\" cx=\"853.28\" cy=\"-124\" rx=\"33.4697\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"853.28\" y=\"-127.2\" font-family=\"Times,serif\" font-size=\"11.00\">Concat&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"853.28\" y=\"-115.2\" font-family=\"Times,serif\" font-size=\"11.00\">Features</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- cat_encoder&#45;&gt;concat_features_0 -->\n",
       "<g id=\"edge19\" class=\"edge\"><title>cat_encoder&#45;&gt;concat_features_0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M776.394,-149.713C787.996,-145.781 801.708,-141.135 814.263,-136.881\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"815.673,-140.099 824.021,-133.575 813.426,-133.469 815.673,-140.099\"/>\n",
       "</g>\n",
       "<!-- numpy_column_selector_0 -->\n",
       "<g id=\"node20\" class=\"node\"><title>numpy_column_selector_0</title>\n",
       "<g id=\"a_node20\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.autoai_libs.numpy_column_selector.html\" xlink:title=\"numpy_column_selector_0 = NumpyColumnSelector(columns=[1, 2])\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"642.687\" cy=\"-98\" rx=\"33.4697\" ry=\"28.0702\"/>\n",
       "<text text-anchor=\"middle\" x=\"642.687\" y=\"-107.2\" font-family=\"Times,serif\" font-size=\"11.00\">Numpy&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"642.687\" y=\"-95.2\" font-family=\"Times,serif\" font-size=\"11.00\">Column&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"642.687\" y=\"-83.2\" font-family=\"Times,serif\" font-size=\"11.00\">Selector</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- opt_standard_scaler -->\n",
       "<g id=\"node21\" class=\"node\"><title>opt_standard_scaler</title>\n",
       "<g id=\"a_node21\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.autoai_libs.opt_standard_scaler.html\" xlink:title=\"opt_standard_scaler = OptStandardScaler()\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"747.984\" cy=\"-102\" rx=\"36.125\" ry=\"28.0702\"/>\n",
       "<text text-anchor=\"middle\" x=\"747.984\" y=\"-111.2\" font-family=\"Times,serif\" font-size=\"11.00\">Opt&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"747.984\" y=\"-99.2\" font-family=\"Times,serif\" font-size=\"11.00\">Standard&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"747.984\" y=\"-87.2\" font-family=\"Times,serif\" font-size=\"11.00\">Scaler</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- numpy_column_selector_0&#45;&gt;opt_standard_scaler -->\n",
       "<g id=\"edge18\" class=\"edge\"><title>numpy_column_selector_0&#45;&gt;opt_standard_scaler</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M676.087,-99.2547C684.252,-99.5708 693.166,-99.916 701.832,-100.252\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"701.704,-103.749 711.832,-100.639 701.975,-96.7544 701.704,-103.749\"/>\n",
       "</g>\n",
       "<!-- opt_standard_scaler&#45;&gt;concat_features_0 -->\n",
       "<g id=\"edge20\" class=\"edge\"><title>opt_standard_scaler&#45;&gt;concat_features_0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M783.093,-109.265C792.242,-111.213 802.224,-113.339 811.687,-115.355\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"811.178,-118.824 821.687,-117.484 812.636,-111.978 811.178,-118.824\"/>\n",
       "</g>\n",
       "<!-- fs1 -->\n",
       "<g id=\"node23\" class=\"node\"><title>fs1</title>\n",
       "<g id=\"a_node23\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.autoai_libs.fs1.html\" xlink:title=\"fs1 = FS1(cols_ids_must_keep=[], additional_col_count_to_keep=3, ptype=&#39;classification&#39;)\">\n",
       "<ellipse fill=\"#b0e2ff\" stroke=\"black\" cx=\"949.514\" cy=\"-124\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"949.514\" y=\"-121.2\" font-family=\"Times,serif\" font-size=\"11.00\">FS1</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- concat_features_0&#45;&gt;fs1 -->\n",
       "<g id=\"edge21\" class=\"edge\"><title>concat_features_0&#45;&gt;fs1</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M886.71,-124C894.871,-124 903.684,-124 912.029,-124\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"912.263,-127.5 922.263,-124 912.263,-120.5 912.263,-127.5\"/>\n",
       "</g>\n",
       "<!-- lr -->\n",
       "<g id=\"node24\" class=\"node\"><title>lr</title>\n",
       "<g id=\"a_node24\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html\" xlink:title=\"lr = LR\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"1039.51\" cy=\"-124\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"1039.51\" y=\"-121.2\" font-family=\"Times,serif\" font-size=\"11.00\">LR</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- fs1&#45;&gt;lr -->\n",
       "<g id=\"edge23\" class=\"edge\"><title>fs1&#45;&gt;lr</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M992.256,-124C995.625,-124 999.013,-124 1002.34,-124\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"994.514,-127.5 1004.51,-124 994.514,-120.5 994.514,-127.5\"/>\n",
       "</g>\n",
       "<!-- knn -->\n",
       "<g id=\"node25\" class=\"node\"><title>knn</title>\n",
       "<g id=\"a_node25\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.k_neighbors_classifier.html\" xlink:title=\"knn = KNN\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"1039.51\" cy=\"-81\" rx=\"27\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"1039.51\" y=\"-78.2\" font-family=\"Times,serif\" font-size=\"11.00\">KNN</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "<!-- xg_boost -->\n",
       "<g id=\"node26\" class=\"node\"><title>xg_boost</title>\n",
       "<g id=\"a_node26\"><a xlink:href=\"https://lale.readthedocs.io/en/latest/modules/lale.lib.xgboost.xgb_classifier.html\" xlink:title=\"xg_boost = XGBoost\">\n",
       "<ellipse fill=\"#7ec0ee\" stroke=\"black\" cx=\"1039.51\" cy=\"-36\" rx=\"27\" ry=\"19.6\"/>\n",
       "<text text-anchor=\"middle\" x=\"1039.51\" y=\"-39.2\" font-family=\"Times,serif\" font-size=\"11.00\">XG&#45;</text>\n",
       "<text text-anchor=\"middle\" x=\"1039.51\" y=\"-27.2\" font-family=\"Times,serif\" font-size=\"11.00\">Boost</text>\n",
       "</a>\n",
       "</g>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.dot.Digraph at 0x7f61d2f12cc0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Draw the pipeline as an operator graph; the recorded output is a Graphviz diagram rendered inline as SVG.\n",
    "pipeline.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "```python\n",
       "from sklearn.pipeline import Pipeline\n",
       "from lale.lib.lale import Scan\n",
       "from lale.expressions import it\n",
       "from lale.lib.lale import Join\n",
       "from lale.lib.lale import Map\n",
       "from lale.expressions import replace\n",
       "from lale.expressions import month\n",
       "from lale.expressions import day_of_month\n",
       "from lale.lib.lale import GroupBy\n",
       "from lale.lib.lale import Aggregate\n",
       "from lale.expressions import sum\n",
       "from lale.expressions import max\n",
       "from lale.expressions import count\n",
       "from lale.expressions import item\n",
       "from lale.lib.lale import ConcatFeatures\n",
       "from sklearn.feature_selection import SelectKBest as SelectFeatures\n",
       "from autoai_libs.transformers.exportable import NumpyColumnSelector\n",
       "from autoai_libs.transformers.exportable import CatEncoder\n",
       "import numpy as np\n",
       "from autoai_libs.transformers.exportable import OptStandardScaler\n",
       "from autoai_libs.cognito.transforms.transform_utils import FS1\n",
       "from sklearn.linear_model import LogisticRegression as LR\n",
       "from sklearn.neighbors import KNeighborsClassifier as KNN\n",
       "from xgboost import XGBClassifier as XGBoost\n",
       "import lale\n",
       "\n",
       "lale.wrap_imported_operators()\n",
       "scan = Scan(table=it.main)\n",
       "scan_0 = Scan(table=it.info)\n",
       "join = Join(\n",
       "    pred=[\n",
       "        (it.main.TrainId == it.info.Train_Id),\n",
       "        (it.main[\"Arrival time\"] >= it.info.TimeStamp),\n",
       "    ]\n",
       ")\n",
       "scan_1 = Scan(table=it.main)\n",
       "scan_2 = Scan(table=it.delay)\n",
       "join_0 = Join(\n",
       "    pred=[\n",
       "        (it.main.TrainId == it.delay.TrainId),\n",
       "        (it.main[\"Arrival time\"] >= it.delay.TimeStamp),\n",
       "    ]\n",
       ")\n",
       "group_by = GroupBy(key=it.MessageId)\n",
       "aggregate = Aggregate(columns=[sum(it.Delay), max(it.Delay)])\n",
       "scan_3 = Scan(table=it.main)\n",
       "scan_4 = Scan(table=it.delay)\n",
       "scan_5 = Scan(table=it.event)\n",
       "join_1 = Join(\n",
       "    pred=[\n",
       "        (it.main.TrainId == it.delay.TrainId),\n",
       "        (it.main[\"Arrival time\"] >= it.delay.TimeStamp),\n",
       "        (it.delay.StationId == it.event.StationId),\n",
       "        (it.main.TimeStamp >= it.event.TimeStamp),\n",
       "    ]\n",
       ")\n",
       "group_by_0 = GroupBy(key=it.MessageId)\n",
       "aggregate_0 = Aggregate(\n",
       "    columns=[count(it.Event), item(it[\"Train class\"], \"Roadwork\")]\n",
       ")\n",
       "numpy_column_selector = NumpyColumnSelector(columns=[0])\n",
       "cat_encoder = CatEncoder(\n",
       "    encoding=\"ordinal\",\n",
       "    categories=\"auto\",\n",
       "    dtype=np.float64,\n",
       "    handle_unknown=\"ignore\",\n",
       ")\n",
       "numpy_column_selector_0 = NumpyColumnSelector(columns=[1, 2])\n",
       "fs1 = FS1(\n",
       "    cols_ids_must_keep=[],\n",
       "    additional_col_count_to_keep=3,\n",
       "    ptype=\"classification\",\n",
       ")\n",
       "pipeline_3 = Pipeline(\n",
       "    steps=[\n",
       "        (\n",
       "            \"preprocessing\",\n",
       "            (\n",
       "                (numpy_column_selector >> cat_encoder)\n",
       "                & (numpy_column_selector_0 >> OptStandardScaler())\n",
       "            )\n",
       "            >> ConcatFeatures\n",
       "            >> fs1,\n",
       "        )\n",
       "    ]\n",
       ")\n",
       "pipeline = (\n",
       "    Pipeline(\n",
       "        steps=[\n",
       "            (\n",
       "                \"data_joins\",\n",
       "                (\n",
       "                    (\n",
       "                        (scan & scan_0)\n",
       "                        >> join\n",
       "                        >> Map(\n",
       "                            columns=[\n",
       "                                replace({\"Intercity\": 1, \"Regional\": 0}),\n",
       "                                it[\"Max Speed (km/h)\"],\n",
       "                                month(\n",
       "                                    it[\"Arrival time\"], \"YYYY-MM-DD HH:MM:SS\"\n",
       "                                ),\n",
       "                                day_of_month(it[\"Arrival time\"]),\n",
       "                            ]\n",
       "                        )\n",
       "                    )\n",
       "                    & ((scan_1 & scan_2) >> join_0 >> group_by >> aggregate)\n",
       "                    & (\n",
       "                        (scan_3 & scan_4 & scan_5)\n",
       "                        >> join_1\n",
       "                        >> group_by_0\n",
       "                        >> aggregate_0\n",
       "                    )\n",
       "                )\n",
       "                >> ConcatFeatures\n",
       "                >> SelectFeatures(),\n",
       "            )\n",
       "        ]\n",
       "    )\n",
       "    >> pipeline_3\n",
       "    >> (LR | KNN | XGBoost)\n",
       ")\n",
       "```"
      ],
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Print the pipeline back as Python source; with ipython_display=True the recorded output is a Markdown-rendered code listing.\n",
    "pipeline.pretty_print(ipython_display=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}