{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"from lale.lib.lale import NoOp\n",
"from lale.lib.sklearn import KNeighborsClassifier\n",
"from lale.lib.sklearn import LogisticRegression\n",
"from lale.lib.sklearn import Nystroem\n",
"from lale.lib.sklearn import PCA\n",
"from lale.operators import make_union, make_choice, make_pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Lale provides an `|` combinator or a function make_choice() to allow only one of its arguments to be applied at once in the overall pipeline. In this example, the first step of the pipeline is a choice between Nystroem and NoOp. This means that the data will either be transformed using Nystroem or will be left as is (NoOp is a transformer that does nothing). The second step in the pipeline is a PCA, and the third step is again a choice between two popular classifiers."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"kernel_tfm_or_not = NoOp | Nystroem\n",
"kernel_tfm_or_not.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tfm = PCA"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"clf = make_choice(LogisticRegression, KNeighborsClassifier)\n",
"clf.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"optimizable = kernel_tfm_or_not >> tfm >> clf\n",
"optimizable.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Use the graph to select the best performing model for a dataset. We use Iris dataset from sklearn for this demonstration. Hyperopt is used to scan the hyperparameter search space and select the best performing path from the above graph. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from lale.lib.lale import Hyperopt\n",
"from lale.datasets import load_iris_df\n",
"\n",
"(X_train, y_train), (X_test, y_test) = load_iris_df()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|█████████| 3/3 [00:07<00:00, 2.64s/trial, best loss: -0.9416666666666667]\n"
]
}
],
"source": [
"hpo_trainable = Hyperopt(estimator=optimizable, max_evals=3)\n",
"hpo_trained = hpo_trainable.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"best_estimator = hpo_trained.get_pipeline()\n",
"best_estimator.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Look at the results from all trials and retrieve pipelines of other names or types."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tid | \n",
" loss | \n",
" time | \n",
" log_loss | \n",
" status | \n",
"
\n",
" \n",
" name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" p0 | \n",
" 0 | \n",
" -0.583333 | \n",
" 0.461873 | \n",
" 1.955799 | \n",
" ok | \n",
"
\n",
" \n",
" p1 | \n",
" 1 | \n",
" -0.941667 | \n",
" 0.436148 | \n",
" 0.249560 | \n",
" ok | \n",
"
\n",
" \n",
" p2 | \n",
" 2 | \n",
" -0.791667 | \n",
" 0.274112 | \n",
" 0.390846 | \n",
" ok | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 -0.583333 0.461873 1.955799 ok\n",
"p1 1 -0.941667 0.436148 0.249560 ok\n",
"p2 2 -0.791667 0.274112 0.390846 ok"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hpo_trained.summary()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"p0\n"
]
}
],
"source": [
"worst_name = hpo_trained.summary().loss.argmax()\n",
"if not isinstance(worst_name, str): #newer pandas argmax returns index\n",
" worst_name = hpo_trained.summary().index[worst_name]\n",
"print(worst_name)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"worst_estimator = hpo_trained.get_pipeline(worst_name)\n",
"worst_estimator.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"worst_estimator_in_sklearn_format = hpo_trained.get_pipeline(worst_name, astype='sklearn')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}