{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: liac-arff>=2.4.0 in /home/hirzel/python3.6venv/lib/python3.6/site-packages (2.4.0)\n", "\u001b[33mWARNING: You are using pip version 20.2.2; however, version 20.2.3 is available.\n", "You should consider upgrading via the '/home/hirzel/python3.6venv/bin/python3.6 -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "#Required for accessing openml datasets from Lale\n", "!pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import lale.datasets.openml\n", "import pandas as pd\n", "(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n", " 'credit-g', 'classification', preprocess=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler as Standard, MinMaxScaler as MinMax\n", "from sklearn.decomposition import PCA\n", "from sklearn.kernel_approximation import Nystroem as Nys\n", "from lale.lib.lale import NoOp\n", "from sklearn.linear_model import LogisticRegression as LR\n", "from sklearn.ensemble import RandomForestClassifier as RF\n", "from xgboost import XGBClassifier as XGBoost\n", "from lale.lib.lale import TopKVotingClassifier\n", "from sklearn.metrics import accuracy_score\n", "from lale.lib.lale import Hyperopt\n", "from sklearn.ensemble import VotingClassifier\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As of now, Hyperopt is the only supported optimizer. `args_to_optimizer`\n", "is a dictionary of the arguments that Hyperopt accepts, as documented at\n", "https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.hyperopt.html" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n",
"\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "TopKVotingClassifier\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "cluster:choice\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "no_op_0\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "no_op->no_op_0\n", "\n", "\n", "\n", "\n", "standard\n", "\n", "\n", "Standard\n", "\n", "\n", "\n", "\n", "min_max\n", "\n", "\n", "Min-\n", "Max\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op_0->lr\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "nys\n", "\n", "\n", "Nys\n", "\n", "\n", "\n", "\n", "rf\n", "\n", "\n", "RF\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", "\n", "\n", "hyperopt\n", "\n", "\n", "Hyperopt\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "planned_pipeline = (NoOp | Standard | MinMax) >> (NoOp | PCA | Nys) >> (LR | RF | XGBoost)\n", "ensemble = TopKVotingClassifier(\n", " estimator=planned_pipeline, k=3, optimizer=Hyperopt,\n", " args_to_optimizer={'max_evals':25, 'scoring':'accuracy'})\n", "ensemble.visualize()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████████████████████████| 25/25 [02:51<00:00, 6.87s/trial, best loss: -0.7417910447761193]\n", "100%|█████████████████████████████████| 1/1 [01:23<00:00, 83.42s/trial, best loss: -0.755223880597015]\n" ] } ], "source": [ "trained_ensemble = ensemble.fit(train_X, train_y)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#Note that you could also pass just the planned 
pipeline as below and Hyperopt with its default setting would be used.\n", "ensemble = TopKVotingClassifier(estimator=planned_pipeline)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7696969696969697\n" ] } ], "source": [ "predictions = trained_ensemble.predict(test_X)\n", "print(accuracy_score(test_y, predictions))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "VotingClassifier\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "cluster:pipeline_0\n", "\n", "\n", "\n", "\n", "\n", "cluster:pipeline_1\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max\n", "\n", "\n", "Min-\n", "Max\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "min_max->no_op\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op->lr\n", "\n", "\n", "\n", "\n", "standard\n", "\n", "\n", "Standard\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "standard->pca\n", "\n", "\n", "\n", "\n", "lr_0\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "pca->lr_0\n", "\n", "\n", "\n", "\n", "min_max_0\n", "\n", "\n", "Min-\n", "Max\n", "\n", "\n", "\n", "\n", "no_op_0\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "min_max_0->no_op_0\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", "\n", "\n", "no_op_0->xg_boost\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "best_pipeline = trained_ensemble.get_pipeline()\n", "best_pipeline.visualize()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'class': 'lale.lib.sklearn.voting_classifier.VotingClassifierImpl',\n", 
" 'state': 'trained',\n", " 'operator': 'VotingClassifier',\n", " 'label': 'VotingClassifier',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.voting_classifier.html',\n", " 'hyperparams': {'estimators': [('p17', {'$ref': '../steps/pipeline'}),\n", " ('p9', {'$ref': '../steps/pipeline_0'}),\n", " ('p16', {'$ref': '../steps/pipeline_1'})],\n", " 'voting': 'soft'},\n", " 'steps': {'pipeline': {'class': 'lale.operators.TrainablePipeline',\n", " 'state': 'trainable',\n", " 'edges': [['min_max', 'no_op'], ['no_op', 'lr']],\n", " 'steps': {'min_max': {'class': 'lale.lib.sklearn.min_max_scaler.MinMaxScalerImpl',\n", " 'state': 'trainable',\n", " 'operator': 'MinMax',\n", " 'label': 'MinMax',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.min_max_scaler.html',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True},\n", " 'no_op': {'class': 'lale.lib.lale.no_op.NoOpImpl',\n", " 'state': 'trained',\n", " 'operator': 'NoOp',\n", " 'label': 'NoOp',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True,\n", " 'coefs': None,\n", " 'is_frozen_trained': True},\n", " 'lr': {'class': 'lale.lib.sklearn.logistic_regression.LogisticRegressionImpl',\n", " 'state': 'trainable',\n", " 'operator': 'LR',\n", " 'label': 'LR',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html',\n", " 'hyperparams': {'C': 22334.598583769228,\n", " 'fit_intercept': False,\n", " 'tol': 0.012327172789706938},\n", " 'is_frozen_trainable': True}}},\n", " 'pipeline_0': {'class': 'lale.operators.TrainablePipeline',\n", " 'state': 'trainable',\n", " 'edges': [['standard', 'pca'], ['pca', 'lr_0']],\n", " 'steps': {'standard': {'class': 'lale.lib.sklearn.standard_scaler.StandardScalerImpl',\n", " 'state': 'trainable',\n", " 'operator': 'Standard',\n", " 
'label': 'Standard',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.standard_scaler.html',\n", " 'hyperparams': {'with_mean': False, 'with_std': False},\n", " 'is_frozen_trainable': True},\n", " 'pca': {'class': 'lale.lib.sklearn.pca.PCAImpl',\n", " 'state': 'trainable',\n", " 'operator': 'PCA',\n", " 'label': 'PCA',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pca.html',\n", " 'hyperparams': {'svd_solver': 'randomized'},\n", " 'is_frozen_trainable': True},\n", " 'lr_0': {'class': 'lale.lib.sklearn.logistic_regression.LogisticRegressionImpl',\n", " 'state': 'trainable',\n", " 'operator': 'LR',\n", " 'label': 'LR',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html',\n", " 'hyperparams': {'C': 31347.843540495694,\n", " 'penalty': 'l1',\n", " 'tol': 0.03811266583210131},\n", " 'is_frozen_trainable': True}}},\n", " 'pipeline_1': {'class': 'lale.operators.TrainablePipeline',\n", " 'state': 'trainable',\n", " 'edges': [['min_max_0', 'no_op_0'], ['no_op_0', 'xg_boost']],\n", " 'steps': {'min_max_0': {'class': 'lale.lib.sklearn.min_max_scaler.MinMaxScalerImpl',\n", " 'state': 'trainable',\n", " 'operator': 'MinMax',\n", " 'label': 'MinMax',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.min_max_scaler.html',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True},\n", " 'no_op_0': {'class': 'lale.lib.lale.no_op.NoOpImpl',\n", " 'state': 'trained',\n", " 'operator': 'NoOp',\n", " 'label': 'NoOp',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True,\n", " 'coefs': None,\n", " 'is_frozen_trained': True},\n", " 'xg_boost': {'class': 'lale.lib.xgboost.xgb_classifier.XGBClassifierImpl',\n", " 'state': 'trainable',\n", " 'operator': 'XGBoost',\n", " 'label': 'XGBoost',\n", 
" 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.xgboost.XGBClassifier.html',\n", " 'hyperparams': {'booster': 'dart',\n", " 'colsample_bylevel': 0.47096071538468853,\n", " 'colsample_bytree': 0.7593792234753081,\n", " 'learning_rate': 0.20970693320349945,\n", " 'max_depth': 18,\n", " 'min_child_weight': 2,\n", " 'n_estimators': 1063,\n", " 'reg_alpha': 0.31844178826811975,\n", " 'reg_lambda': 0.7797218315351517,\n", " 'subsample': 0.64136510248406},\n", " 'is_frozen_trainable': True}}}},\n", " 'is_frozen_trainable': True,\n", " 'coefs': 'coefs_not_available',\n", " 'is_frozen_trained': False}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "best_pipeline.to_json()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }