{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "from lale.lib.lale import NoOp\n", "from lale.lib.sklearn import KNeighborsClassifier\n", "from lale.lib.sklearn import LogisticRegression\n", "from lale.lib.sklearn import Nystroem\n", "from lale.lib.sklearn import PCA\n", "from lale.operators import make_union, make_choice, make_pipeline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Lale provides the `|` combinator, or alternatively the function `make_choice()`, to express that only one of the arguments is applied at a time in the overall pipeline. In this example, the first step of the pipeline is a choice between NoOp and Nystroem. This means that the data will either be transformed using Nystroem or be left as is (NoOp is a transformer that does nothing). The second step in the pipeline is PCA, and the third step is again a choice, this time between two popular classifiers." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "nystroem\n", "\n", "\n", "Nystroem\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "kernel_tfm_or_not = NoOp | Nystroem\n", "kernel_tfm_or_not.visualize()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "tfm = PCA" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "logistic_regression\n", "\n", "\n", "Logistic-\n", "Regression\n", "\n", "\n", "\n", "\n", "k_neighbors_classifier\n", 
"\n", "\n", "K-\n", "Neighbors-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "clf = make_choice(LogisticRegression, KNeighborsClassifier)\n", "clf.visualize()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "no_op->pca\n", "\n", "\n", "\n", "\n", "nystroem\n", "\n", "\n", "Nystroem\n", "\n", "\n", "\n", "\n", "logistic_regression\n", "\n", "\n", "Logistic-\n", "Regression\n", "\n", "\n", "\n", "\n", "pca->logistic_regression\n", "\n", "\n", "\n", "\n", "k_neighbors_classifier\n", "\n", "\n", "K-\n", "Neighbors-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "optimizable = kernel_tfm_or_not >> tfm >> clf\n", "optimizable.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Use the graph to select the best-performing model for a dataset. We use the Iris dataset from sklearn for this demonstration. Hyperopt is used to scan the hyperparameter search space and select the best-performing path from the above graph. 
" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from lale.lib.lale import Hyperopt\n", "from lale.datasets import load_iris_df\n", "\n", "(X_train, y_train), (X_test, y_test) = load_iris_df()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|█████████| 3/3 [00:07<00:00, 2.64s/trial, best loss: -0.9416666666666667]\n" ] } ], "source": [ "hpo_trainable = Hyperopt(estimator=optimizable, max_evals=3)\n", "hpo_trained = hpo_trainable.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "nystroem\n", "\n", "\n", "Nystroem\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "nystroem->pca\n", "\n", "\n", "\n", "\n", "logistic_regression\n", "\n", "\n", "Logistic-\n", "Regression\n", "\n", "\n", "\n", "\n", "pca->logistic_regression\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "best_estimator = hpo_trained.get_pipeline()\n", "best_estimator.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Look at the results from all trials and retrieve other pipelines by name, optionally in a different format." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00-0.5833330.4618731.955799ok
p11-0.9416670.4361480.249560ok
p22-0.7916670.2741120.390846ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 -0.583333 0.461873 1.955799 ok\n", "p1 1 -0.941667 0.436148 0.249560 ok\n", "p2 2 -0.791667 0.274112 0.390846 ok" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hpo_trained.summary()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "p0\n" ] } ], "source": [ "# Hyperopt reports the smallest loss as the best one, so the worst trial is the one with the largest loss.\n", "# idxmax() returns the index label (the pipeline name) on every pandas version,\n", "# whereas argmax() returns a label on old versions and a position on newer ones.\n", "worst_name = hpo_trained.summary().loss.idxmax()\n", "print(worst_name)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "nystroem\n", "\n", "\n", "Nystroem\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "nystroem->pca\n", "\n", "\n", "\n", "\n", "k_neighbors_classifier\n", "\n", "\n", "K-\n", "Neighbors-\n", "Classifier\n", "\n", "\n", "\n", "\n", "pca->k_neighbors_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "worst_estimator = hpo_trained.get_pipeline(worst_name)\n", "worst_estimator.visualize()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "worst_estimator_in_sklearn_format = hpo_trained.get_pipeline(worst_name, astype='sklearn')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }