{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Lale: Type-Driven Auto-ML with Scikit-Learn\n", "\n", "### https://github.com/ibm/lale\n", "\n", "### Example Dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: liac-arff>=2.4.0 in /home/hirzel/python3.6venv/lib/python3.6/site-packages (2.4.0)\r\n" ] } ], "source": [ "# Use the %pip magic rather than !pip so the package is installed into the\n", "# environment of the running kernel, not whatever pip is first on PATH.\n", "%pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ychecking_statusdurationcredit_historypurposecredit_amountsavings_statusemploymentinstallment_commitmentpersonal_status...residence_sinceproperty_magnitudeageother_payment_planshousingexisting_creditsjobnum_dependentsown_telephoneforeign_worker
8350<012.0no credits/all paidnew car1082.0<1001<=X<44.0male single...4.0car48.0bankown2.0skilled1.0noneyes
19200<=X<20027.0existing paidbusiness3915.0<1001<=X<44.0male single...2.0car36.0noneown1.0skilled2.0yesyes
6291no checking9.0existing paideducation3832.0no known savings>=71.0male single...4.0real estate64.0noneown1.0unskilled resident1.0noneyes
55900<=X<20018.0critical/other existing creditfurniture/equipment1928.0<100<12.0male single...2.0real estate31.0noneown2.0unskilled resident1.0noneyes
68410<=X<20036.0delayed previouslybusiness9857.0100<=X<5004<=X<71.0male single...3.0life insurance31.0noneown2.0unskilled resident2.0yesyes
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " y checking_status duration credit_history \\\n", "835 0 <0 12.0 no credits/all paid \n", "192 0 0<=X<200 27.0 existing paid \n", "629 1 no checking 9.0 existing paid \n", "559 0 0<=X<200 18.0 critical/other existing credit \n", "684 1 0<=X<200 36.0 delayed previously \n", "\n", " purpose credit_amount savings_status employment \\\n", "835 new car 1082.0 <100 1<=X<4 \n", "192 business 3915.0 <100 1<=X<4 \n", "629 education 3832.0 no known savings >=7 \n", "559 furniture/equipment 1928.0 <100 <1 \n", "684 business 9857.0 100<=X<500 4<=X<7 \n", "\n", " installment_commitment personal_status ... residence_since \\\n", "835 4.0 male single ... 4.0 \n", "192 4.0 male single ... 2.0 \n", "629 1.0 male single ... 4.0 \n", "559 2.0 male single ... 2.0 \n", "684 1.0 male single ... 3.0 \n", "\n", " property_magnitude age other_payment_plans housing existing_credits \\\n", "835 car 48.0 bank own 2.0 \n", "192 car 36.0 none own 1.0 \n", "629 real estate 64.0 none own 1.0 \n", "559 real estate 31.0 none own 2.0 \n", "684 life insurance 31.0 none own 2.0 \n", "\n", " job num_dependents own_telephone foreign_worker \n", "835 skilled 1.0 none yes \n", "192 skilled 2.0 yes yes \n", "629 unskilled resident 1.0 none yes \n", "559 unskilled resident 1.0 none yes \n", "684 unskilled resident 2.0 yes yes \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import lale.datasets.openml\n", "import pandas as pd\n", "(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n", " 'credit-g', 'classification', preprocess=False)\n", "pd.concat([pd.DataFrame({'y': train_y}, index=train_X.index).tail(),\n", " train_X.tail()], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Algorithm Selection and Hyperparameter Tuning" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import Normalizer as 
Norm\n", "from sklearn.preprocessing import OneHotEncoder as OneHot\n", "from sklearn.linear_model import LogisticRegression as LR\n", "from xgboost import XGBClassifier as XGBoost\n", "from sklearn.svm import LinearSVC\n", "from lale.operators import make_pipeline, make_union\n", "from lale.lib.lale import Project, ConcatFeatures, NoOp\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "project_0->norm\n", "\n", "\n", "\n", "\n", "concat_features\n", "\n", "\n", "Concat-\n", "Features\n", "\n", "\n", "\n", "\n", "norm->concat_features\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "project_1->one_hot\n", "\n", "\n", "\n", "\n", "one_hot->concat_features\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "concat_features->lr\n", "\n", "\n", "\n", "\n", "linear_svc\n", "\n", "\n", "Linear-\n", "SVC\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "project_nums = Project(columns={'type': 'number'})\n", "project_cats = Project(columns={'type': 'string'})\n", "planned_pipeline = (\n", " (project_nums >> (Norm | NoOp) & project_cats >> OneHot)\n", " >> ConcatFeatures\n", " >> (LR | LinearSVC(dual=False)| XGBoost))\n", "planned_pipeline.visualize()" ] }, { "cell_type": "code", "execution_count": 5, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|████████████| 5/5 [00:47<00:00, 9.36s/it, best loss: -0.7507273649370062]\n", "accuracy 72.1%\n" ] } ], "source": [ "import sklearn.metrics\n", "from lale.lib.lale import Hyperopt\n", "auto_optimizer = Hyperopt(estimator=planned_pipeline, cv=3, max_evals=5)\n", "auto_trained = auto_optimizer.fit(train_X, train_y)\n", "auto_y = auto_trained.predict(test_X)\n", "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, auto_y):.1%}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Displaying Automation Results" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "project_0->norm\n", "\n", "\n", "\n", "\n", "concat_features\n", "\n", "\n", "Concat-\n", "Features\n", "\n", "\n", "\n", "\n", "norm->concat_features\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "project_1->one_hot\n", "\n", "\n", "\n", "\n", "one_hot->concat_features\n", "\n", "\n", "\n", "\n", "linear_svc\n", "\n", "\n", "Linear-\n", "SVC\n", "\n", "\n", "\n", "\n", "concat_features->linear_svc\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "best_pipeline = auto_trained.get_pipeline()\n", "best_pipeline.visualize()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "```python\n", "project_0 = Project(columns={'type': 'number'})\n", "norm = Norm(norm='l1')\n", "project_1 = Project(columns={'type': 'string'})\n", "linear_svc = LinearSVC(dual=False, C=9773.459065896624, 
tol=0.0006905227182226334)\n", "pipeline = ((project_0 >> norm) & (project_1 >> OneHot())) >> ConcatFeatures() >> linear_svc\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.pretty_print import ipython_display\n", "ipython_display(best_pipeline, show_imports=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### JSON Schemas\n", "\n", "https://json-schema.org/" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "```python\n", "{\n", " 'description': 'Number of trees to fit.',\n", " 'type': 'integer',\n", " 'default': 100,\n", " 'minimumForOptimizer': 10,\n", " 'maximumForOptimizer': 1500}\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ipython_display(XGBoost.hyperparam_schema('n_estimators'))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "```python\n", "{\n", " 'description': 'Specify which booster to use.',\n", " 'enum': ['gbtree', 'gblinear', 'dart'],\n", " 'default': 'gbtree'}\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ipython_display(XGBoost.hyperparam_schema('booster'))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Invalid configuration for XGBoost(n_estimators=0.5, booster='gbtree') due to invalid value n_estimators=0.5.\n", "Schema of argument n_estimators: {\n", " 'description': 'Number of trees to fit.',\n", " 'type': 'integer',\n", " 'default': 100,\n", " 'minimumForOptimizer': 10,\n", " 'maximumForOptimizer': 1500}\n", "Value: 0.5\n" ] } ], "source": [ "import sys\n", "\n", "import jsonschema\n", "\n", "# The schema declares n_estimators as an integer, so 0.5 is deliberately\n", "# invalid; print the validation message instead of showing a traceback.\n", "try:\n", "    XGBoost(n_estimators=0.5, booster='gbtree')\n", "except jsonschema.ValidationError as schema_error:\n", "    print(schema_error.message, file=sys.stderr)" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "### Customizing Schemas" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import lale.schemas as schemas\n", "\n", "# Grove: XGBoost with n_estimators constrained to 2..10 and the booster\n", "# restricted to the single value 'gbtree'.\n", "Grove = XGBoost.customize_schema(\n", "    n_estimators=schemas.Int(min=2, max=10),\n", "    booster=schemas.Enum(['gbtree']),\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# `>>` binds tighter than `&` in Python; the inner parentheses only make\n", "# the two preprocessing branches explicit.\n", "grove_planned = (\n", "    (Project(columns={'type': 'number'}) >> Norm)\n", "    & (Project(columns={'type': 'string'}) >> OneHot)\n", ") >> ConcatFeatures >> Grove" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 10/10 [01:16<00:00, 7.70s/it, best loss: -0.7418455050181508]\n", "accuracy 75.5%\n" ] } ], "source": [ "grove_optimizer = Hyperopt(estimator=grove_planned, cv=3, max_evals=10)\n", "grove_trained = grove_optimizer.fit(train_X, train_y)\n", "grove_y = grove_trained.predict(test_X)\n", "grove_accuracy = sklearn.metrics.accuracy_score(test_y, grove_y)\n", "print(f'accuracy {grove_accuracy:.1%}')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "```python\n", "project_0 = Project(columns={'type': 'number'})\n", "norm = Norm(norm='max')\n", "project_1 = Project(columns={'type': 'string'})\n", "xg_boost = XGBoost(booster='gbtree', colsample_bylevel=0.9207359753561951, colsample_bytree=0.8853673179789476, learning_rate=0.7582425275075225, min_child_weight=11, n_estimators=8, reg_alpha=0.5980470775121279, reg_lambda=0.2546844052569046, subsample=0.8142720284737895)\n", "pipeline = ((project_0 >> norm) & (project_1 >> OneHot())) >> ConcatFeatures() >> xg_boost\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "grove_best = grove_trained.get_pipeline()\n", "ipython_display(grove_best, show_imports=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Human-in-the-Loop 
Auto-ML\n", "\n", "### https://github.com/ibm/lale" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }