{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "from lale.lib.lale import NoOp\n", "from lale.lib.sklearn import KNeighborsClassifier\n", "from lale.lib.sklearn import LogisticRegression\n", "from lale.lib.sklearn import Nystroem\n", "from lale.lib.sklearn import PCA\n", "from lale.operators import make_union, make_choice, make_pipeline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Lale provides an `|` combinator or a function make_choice() to allow only one of its arguments to be applied at once in the overall pipeline. In this example, the first step of the pipeline is a choice between Nystroem and NoOp. This means that the data will either be transformed using Nystroem or will be left as is (NoOp is a transformer that does nothing). The second step in the pipeline is a PCA, and the third step is again a choice between two popular classifiers." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "\n", "nystroem\n", "\n", "\n", "Nystroem\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "kernel_tfm_or_not = NoOp | Nystroem\n", "kernel_tfm_or_not.visualize()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "tfm = PCA" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "\n", "logistic_regression\n", "\n", "\n", "Logistic-\n", "Regression\n", "\n", "\n", "\n", "\n", "\n", "k_neighbors_classifier\n", "\n", "\n", "K-\n", "Neighbors-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "clf = make_choice(LogisticRegression, KNeighborsClassifier)\n", "clf.visualize()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "no_op->pca\n", "\n", "\n", "\n", "\n", "\n", "nystroem\n", "\n", "\n", "Nystroem\n", "\n", "\n", "\n", "\n", "\n", "logistic_regression\n", "\n", "\n", "Logistic-\n", "Regression\n", "\n", "\n", "\n", "\n", "\n", "pca->logistic_regression\n", "\n", "\n", "\n", "\n", "\n", "k_neighbors_classifier\n", "\n", "\n", "K-\n", "Neighbors-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "optimizable = kernel_tfm_or_not >> tfm >> clf\n", "optimizable.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Use the graph to select the best performing model for a dataset. We use Iris dataset from sklearn for this demonstration. Hyperopt is used to scan the hyperparameter search space and select the best performing path from the above graph. " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from lale.lib.lale import Hyperopt\n", "from lale.datasets import load_iris_df\n", "\n", "(X_train, y_train), (X_test, y_test) = load_iris_df()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:06<00:00, 2.16s/it, best loss: -0.7847272727272727]\n" ] } ], "source": [ "hpo_trainable = Hyperopt(estimator=optimizable, max_evals=3)\n", "hpo_trained = hpo_trainable.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "no_op->pca\n", "\n", "\n", "\n", "\n", "\n", "logistic_regression\n", "\n", "\n", "Logistic-\n", "Regression\n", "\n", "\n", "\n", "\n", "\n", "pca->logistic_regression\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "best_estimator = hpo_trained.get_pipeline()\n", "best_estimator.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Look at the results from all trials and retrieve pipelines of other names or types." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00-0.3580610.3179981.365361ok
p11-0.3270610.30371817.448898ok
p22-0.7847270.2790180.374613ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 -0.358061 0.317998 1.365361 ok\n", "p1 1 -0.327061 0.303718 17.448898 ok\n", "p2 2 -0.784727 0.279018 0.374613 ok" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hpo_trained.summary()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "p1\n" ] } ], "source": [ "worst_name = hpo_trained.summary().loss.argmax()\n", "print(worst_name)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "\n", "nystroem\n", "\n", "\n", "Nystroem\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "nystroem->pca\n", "\n", "\n", "\n", "\n", "\n", "logistic_regression\n", "\n", "\n", "Logistic-\n", "Regression\n", "\n", "\n", "\n", "\n", "\n", "pca->logistic_regression\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "worst_estimator = hpo_trained.get_pipeline(worst_name)\n", "worst_estimator.visualize()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "#FIXME\n", "#worst_estimator_in_sklearn_format = hpo_trained.get_pipeline(worst_name, astype='sklearn')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }