{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudetarget
03.259633.05.0176571.0064212300.03.69181432.71-117.031.030
13.812549.04.4735451.0410051314.01.73809533.77-118.163.821
24.15634.05.6458330.985119915.02.72321434.66-120.481.726
31.942536.04.0028171.0338031418.03.99436632.69-117.110.934
43.554243.06.2684211.134211874.02.30000036.78-119.800.965
\n", "
" ], "text/plain": [ " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", "0 3.2596 33.0 5.017657 1.006421 2300.0 3.691814 32.71 \n", "1 3.8125 49.0 4.473545 1.041005 1314.0 1.738095 33.77 \n", "2 4.1563 4.0 5.645833 0.985119 915.0 2.723214 34.66 \n", "3 1.9425 36.0 4.002817 1.033803 1418.0 3.994366 32.69 \n", "4 3.5542 43.0 6.268421 1.134211 874.0 2.300000 36.78 \n", "\n", " Longitude target \n", "0 -117.03 1.030 \n", "1 -118.16 3.821 \n", "2 -120.48 1.726 \n", "3 -117.11 0.934 \n", "4 -119.80 0.965 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import lale.datasets\n", "(train_X, train_y), (test_X, test_y) = lale.datasets.california_housing_df()\n", "pd.concat([train_X.head(), train_y.head()], axis=1)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler as Scale\n", "from sklearn.preprocessing import Normalizer as Norm\n", "from lale.lib.lale import NoOp\n", "from sklearn.decomposition import PCA\n", "from sklearn.tree import DecisionTreeRegressor as Tree\n", "from sklearn.linear_model import LinearRegression as Linear\n", "from xgboost import XGBRegressor as XGB\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_2\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "scale\n", "\n", "\n", "Scale\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "scale->pca\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "no_op_0\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "tree\n", "\n", "\n", "Tree\n", "\n", "\n", "\n", "\n", "pca->tree\n", "\n", "\n", "\n", "\n", "no_op_1\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "linear\n", "\n", "\n", "Linear\n", "\n", "\n", "\n", "\n", "xgb\n", "\n", "\n", "XGB\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "planned_pipeline = (Scale | Norm | NoOp) >> (PCA | NoOp) >> (Tree | Linear | XGB)\n", "planned_pipeline.visualize()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 68%|████▊ | 34/50 [03:11<01:29, 5.62s/trial, best loss: -0.6110921251096775]\n" ] } ], "source": [ "from lale.lib.lale import Hyperopt\n", "import sklearn.metrics\n", "r2 = sklearn.metrics.make_scorer(sklearn.metrics.r2_score)\n", "trained_pipeline = planned_pipeline.auto_configure(\n", " train_X, train_y, optimizer=Hyperopt, scoring=r2,\n", " max_opt_time=3*60, max_eval_time=30, cv=3)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "R2 score: 0.58\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "no_op->pca\n", "\n", "\n", "\n", "\n", "linear\n", "\n", "\n", "Linear\n", "\n", "\n", "\n", "\n", "pca->linear\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "print(f'R2 score: {r2(trained_pipeline, test_X, test_y):.2f}')\n", "trained_pipeline.visualize()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "```python\n", "from lale.lib.lale import NoOp\n", "from sklearn.decomposition import PCA\n", "from sklearn.linear_model import LinearRegression as Linear\n", "import lale\n", "\n", "lale.wrap_imported_operators()\n", "pca = PCA(svd_solver=\"randomized\")\n", "linear = Linear(normalize=True)\n", "pipeline = NoOp() >> pca >> linear\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trained_pipeline.pretty_print(ipython_display=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }