{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: liac-arff>=2.4.0 in /home/hirzel/python3.6venv/lib/python3.6/site-packages (2.4.0)\r\n" ] } ], "source": [ "# Required for accessing OpenML datasets from Lale.\n", "# %pip (rather than !pip) installs into the environment of the running kernel.\n", "%pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import lale.datasets.openml\n", "import pandas as pd\n", "# Load the OpenML 'credit-g' classification dataset as train and test splits.\n", "(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n", " 'credit-g', 'classification', preprocess=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler as Standard, MinMaxScaler as MinMax\n", "from sklearn.decomposition import PCA\n", "from sklearn.kernel_approximation import Nystroem as Nys\n", "from lale.lib.lale import NoOp\n", "from sklearn.linear_model import LogisticRegression as LR\n", "from sklearn.ensemble import RandomForestClassifier as RF\n", "from xgboost import XGBClassifier as XGBoost\n", "from lale.lib.lale import TopKVotingClassifier\n", "from sklearn.metrics import accuracy_score\n", "from lale.lib.lale import Hyperopt\n", "from sklearn.ensemble import VotingClassifier\n", "# The name `lale` is already bound by `import lale.datasets.openml` in the previous cell.\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`Hyperopt` is the only optimizer supported as of now.\n", "`args_to_optimizer` is a dictionary of arguments that Hyperopt accepts, as documented at\n", "https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.hyperopt.html" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "TopKVotingClassifier\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "cluster:choice\n", "\n", "\n", "Choice\n", "\n", "\n", 
"\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "no_op_0\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "no_op->no_op_0\n", "\n", "\n", "\n", "\n", "standard\n", "\n", "\n", "Standard\n", "\n", "\n", "\n", "\n", "min_max\n", "\n", "\n", "Min-\n", "Max\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op_0->lr\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "nys\n", "\n", "\n", "Nys\n", "\n", "\n", "\n", "\n", "rf\n", "\n", "\n", "RF\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", "\n", "\n", "hyperopt\n", "\n", "\n", "Hyperopt\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Each parenthesized stage offers alternatives (|); the stages are chained with >>.\n", "planned_pipeline = (NoOp | Standard | MinMax) >> (NoOp | PCA | Nys) >> (LR | RF | XGBoost)\n", "ensemble = TopKVotingClassifier(\n", " estimator=planned_pipeline, k=3, optimizer=Hyperopt,\n", " args_to_optimizer={'max_evals':25, 'scoring':'accuracy'})\n", "ensemble.visualize()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|███████| 25/25 [03:15<00:00, 4.04s/trial, best loss: -0.7448038005461415]\n", "100%|█████████| 1/1 [00:06<00:00, 6.63s/trial, best loss: -0.7508298939720779]\n" ] } ], "source": [ "trained_ensemble = ensemble.fit(train_X, train_y)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Note that you could also pass just the planned pipeline, as below; Hyperopt with its default settings would then be used.\n", "# A separate name keeps the configured `ensemble` from being overwritten, so re-running the fit cell stays reproducible.\n", "default_ensemble = TopKVotingClassifier(estimator=planned_pipeline)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"0.7515151515151515\n" ] } ], "source": [ "# Evaluate the trained ensemble on the test split.\n", "predictions = trained_ensemble.predict(test_X)\n", "print(accuracy_score(test_y, predictions))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "VotingClassifier\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "cluster:pipeline_0\n", "\n", "\n", "\n", "\n", "\n", "cluster:pipeline_1\n", "\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "no_op->pca\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "pca->lr\n", "\n", "\n", "\n", "\n", "no_op_0\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "no_op_1\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "no_op_0->no_op_1\n", "\n", "\n", "\n", "\n", "lr_0\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op_1->lr_0\n", "\n", "\n", "\n", "\n", "min_max\n", "\n", "\n", "Min-\n", "Max\n", "\n", "\n", "\n", "\n", "no_op_2\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "min_max->no_op_2\n", "\n", "\n", "\n", "\n", "lr_1\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op_2->lr_1\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Extract the pipeline held by the trained ensemble and draw it.\n", "best_pipeline = trained_ensemble.get_pipeline()\n", "best_pipeline.visualize()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'class': 'lale.lib.sklearn.voting_classifier.VotingClassifierImpl',\n", " 'state': 'trained',\n", " 'operator': 'VotingClassifier',\n", " 'label': 'VotingClassifier',\n", " 'documentation_url': 'https://scikit-learn.org/0.20/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn-ensemble-votingclassifier',\n", " 'hyperparams': {'estimators': [('p14', 
{'$ref': '../steps/pipeline'}),\n", " ('p21', {'$ref': '../steps/pipeline_0'}),\n", " ('p11', {'$ref': '../steps/pipeline_1'})],\n", " 'voting': 'soft'},\n", " 'steps': {'pipeline': {'class': 'lale.operators.TrainablePipeline',\n", " 'state': 'trainable',\n", " 'edges': [['no_op', 'pca'], ['pca', 'lr']],\n", " 'steps': {'no_op': {'class': 'lale.lib.lale.no_op.NoOpImpl',\n", " 'state': 'trained',\n", " 'operator': 'NoOp',\n", " 'label': 'NoOp',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True,\n", " 'coefs': None,\n", " 'is_frozen_trained': True},\n", " 'pca': {'class': 'lale.lib.sklearn.pca.PCAImpl',\n", " 'state': 'trainable',\n", " 'operator': 'PCA',\n", " 'label': 'PCA',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pca.html',\n", " 'hyperparams': {'svd_solver': 'randomized'},\n", " 'is_frozen_trainable': True},\n", " 'lr': {'class': 'lale.lib.sklearn.logistic_regression.LogisticRegressionImpl',\n", " 'state': 'trainable',\n", " 'operator': 'LR',\n", " 'label': 'LR',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html',\n", " 'hyperparams': {'C': 32591.329013327737,\n", " 'penalty': 'l1',\n", " 'tol': 0.04931166736770484},\n", " 'is_frozen_trainable': True}}},\n", " 'pipeline_0': {'class': 'lale.operators.TrainablePipeline',\n", " 'state': 'trainable',\n", " 'edges': [['no_op_0', 'no_op_1'], ['no_op_1', 'lr_0']],\n", " 'steps': {'no_op_0': {'class': 'lale.lib.lale.no_op.NoOpImpl',\n", " 'state': 'trained',\n", " 'operator': 'NoOp',\n", " 'label': 'NoOp',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True,\n", " 'coefs': None,\n", " 'is_frozen_trained': True},\n", " 'no_op_1': {'class': 'lale.lib.lale.no_op.NoOpImpl',\n", " 'state': 
'trained',\n", " 'operator': 'NoOp',\n", " 'label': 'NoOp',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True,\n", " 'coefs': None,\n", " 'is_frozen_trained': True},\n", " 'lr_0': {'class': 'lale.lib.sklearn.logistic_regression.LogisticRegressionImpl',\n", " 'state': 'trainable',\n", " 'operator': 'LR',\n", " 'label': 'LR',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html',\n", " 'hyperparams': {'C': 32762.313084225098,\n", " 'penalty': 'l1',\n", " 'tol': 0.04485831858516044},\n", " 'is_frozen_trainable': True}}},\n", " 'pipeline_1': {'class': 'lale.operators.TrainablePipeline',\n", " 'state': 'trainable',\n", " 'edges': [['min_max', 'no_op_2'], ['no_op_2', 'lr_1']],\n", " 'steps': {'min_max': {'class': 'lale.lib.sklearn.min_max_scaler.MinMaxScalerImpl',\n", " 'state': 'trainable',\n", " 'operator': 'MinMax',\n", " 'label': 'MinMax',\n", " 'documentation_url': 'https://scikit-learn.org/0.20/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn-preprocessing-minmaxscaler',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True},\n", " 'no_op_2': {'class': 'lale.lib.lale.no_op.NoOpImpl',\n", " 'state': 'trained',\n", " 'operator': 'NoOp',\n", " 'label': 'NoOp',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html',\n", " 'hyperparams': {},\n", " 'is_frozen_trainable': True,\n", " 'coefs': None,\n", " 'is_frozen_trained': True},\n", " 'lr_1': {'class': 'lale.lib.sklearn.logistic_regression.LogisticRegressionImpl',\n", " 'state': 'trainable',\n", " 'operator': 'LR',\n", " 'label': 'LR',\n", " 'documentation_url': 'https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html',\n", " 'hyperparams': {'C': 20782.604482916624,\n", " 'solver': 'lbfgs',\n", " 'tol': 0.051260844412032186},\n", " 
'is_frozen_trainable': True}}}},\n", " 'is_frozen_trainable': True,\n", " 'coefs': 'coefs_not_available',\n", " 'is_frozen_trained': False}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Serialize the extracted pipeline and show the result as the cell output.\n", "pipeline_json = best_pipeline.to_json()\n", "pipeline_json" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }