{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Example Dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: liac-arff>=2.4.0 in /home/hirzel/python3.6venv/lib/python3.6/site-packages (2.4.0)\r\n" ] } ], "source": [ "%pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ychecking_statusdurationcredit_historypurposecredit_amountsavings_statusemploymentinstallment_commitmentpersonal_status...residence_sinceproperty_magnitudeageother_payment_planshousingexisting_creditsjobnum_dependentsown_telephoneforeign_worker
8350<012.0no credits/all paidnew car1082.0<1001<=X<44.0male single...4.0car48.0bankown2.0skilled1.0noneyes
19200<=X<20027.0existing paidbusiness3915.0<1001<=X<44.0male single...2.0car36.0noneown1.0skilled2.0yesyes
6291no checking9.0existing paideducation3832.0no known savings>=71.0male single...4.0real estate64.0noneown1.0unskilled resident1.0noneyes
55900<=X<20018.0critical/other existing creditfurniture/equipment1928.0<100<12.0male single...2.0real estate31.0noneown2.0unskilled resident1.0noneyes
68410<=X<20036.0delayed previouslybusiness9857.0100<=X<5004<=X<71.0male single...3.0life insurance31.0noneown2.0unskilled resident2.0yesyes
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " y checking_status duration credit_history \\\n", "835 0 <0 12.0 no credits/all paid \n", "192 0 0<=X<200 27.0 existing paid \n", "629 1 no checking 9.0 existing paid \n", "559 0 0<=X<200 18.0 critical/other existing credit \n", "684 1 0<=X<200 36.0 delayed previously \n", "\n", " purpose credit_amount savings_status employment \\\n", "835 new car 1082.0 <100 1<=X<4 \n", "192 business 3915.0 <100 1<=X<4 \n", "629 education 3832.0 no known savings >=7 \n", "559 furniture/equipment 1928.0 <100 <1 \n", "684 business 9857.0 100<=X<500 4<=X<7 \n", "\n", " installment_commitment personal_status ... residence_since \\\n", "835 4.0 male single ... 4.0 \n", "192 4.0 male single ... 2.0 \n", "629 1.0 male single ... 4.0 \n", "559 2.0 male single ... 2.0 \n", "684 1.0 male single ... 3.0 \n", "\n", " property_magnitude age other_payment_plans housing existing_credits \\\n", "835 car 48.0 bank own 2.0 \n", "192 car 36.0 none own 1.0 \n", "629 real estate 64.0 none own 1.0 \n", "559 real estate 31.0 none own 2.0 \n", "684 life insurance 31.0 none own 2.0 \n", "\n", " job num_dependents own_telephone foreign_worker \n", "835 skilled 1.0 none yes \n", "192 skilled 2.0 yes yes \n", "629 unskilled resident 1.0 none yes \n", "559 unskilled resident 1.0 none yes \n", "684 unskilled resident 2.0 yes yes \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import lale.datasets.openml\n", "import pandas as pd\n", "(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n", " 'credit-g', 'classification', preprocess=False)\n", "pd.concat([pd.DataFrame({'y': train_y}, index=train_X.index).tail(),\n", " train_X.tail()], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Algorithm Selection and Hyperparameter Tuning" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import Normalizer as 
Norm\n", "from sklearn.preprocessing import OneHotEncoder as OneHot\n", "from lale.lib.lale import Project, ConcatFeatures, NoOp\n", "from sklearn.linear_model import LogisticRegression as LR\n", "from sklearn.svm import LinearSVC\n", "from xgboost import XGBClassifier as XGBoost\n", "from lale import wrap_imported_operators\n", "wrap_imported_operators()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "project_0->norm\n", "\n", "\n", "\n", "\n", "concat_features\n", "\n", "\n", "Concat-\n", "Features\n", "\n", "\n", "\n", "\n", "norm->concat_features\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "project_1->one_hot\n", "\n", "\n", "\n", "\n", "one_hot->concat_features\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "concat_features->lr\n", "\n", "\n", "\n", "\n", "linear_svc\n", "\n", "\n", "Linear-\n", "SVC\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.operators import make_pipeline, make_union\n", "\n", "planned_pipeline = make_pipeline(\n", " make_union(\n", " make_pipeline(Project(columns={'type': 'number'}), Norm | NoOp),\n", " make_pipeline(Project(columns={'type': 'string'}), OneHot)),\n", " LR | LinearSVC(dual=False)| XGBoost)\n", "planned_pipeline.visualize()" ] }, { "cell_type": "code", 
"execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 10/10 [02:23<00:00, 14.32s/it, best loss: -0.7507273649370062]\n" ] } ], "source": [ "from lale.lib.lale import Hyperopt\n", "trained_pipeline = planned_pipeline.auto_configure(\n", " train_X, train_y, Hyperopt, cv=3, max_evals=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Displaying Automation Results" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy 72.1%\n" ] } ], "source": [ "from sklearn.metrics import accuracy_score\n", "predictions = trained_pipeline.predict(test_X)\n", "print(f'accuracy {accuracy_score(test_y, predictions):.1%}')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "project_0\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "project_0->norm\n", "\n", "\n", "\n", "\n", "concat_features\n", "\n", "\n", "Concat-\n", "Features\n", "\n", "\n", "\n", "\n", "norm->concat_features\n", "\n", "\n", "\n", "\n", "project_1\n", "\n", "\n", "Project\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "project_1->one_hot\n", "\n", "\n", "\n", "\n", "one_hot->concat_features\n", "\n", "\n", "\n", "\n", "linear_svc\n", "\n", "\n", "Linear-\n", "SVC\n", "\n", "\n", "\n", "\n", "concat_features->linear_svc\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trained_pipeline.visualize()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "```python\n", "project_0 = Project(columns={'type': 'number'})\n", "norm = Norm(norm='l1')\n", "project_1 = 
Project(columns={'type': 'string'})\n", "linear_svc = LinearSVC(dual=False, C=9773.459065896624, tol=0.0006905227182226334)\n", "pipeline = ((project_0 >> norm) & (project_1 >> OneHot())) >> ConcatFeatures() >> linear_svc\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trained_pipeline.pretty_print(ipython_display=True, show_imports=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Call for Users and Contributors\n", "\n", "- Repository: https://github.com/IBM/lale\n", "\n", "- [Guide for scikit-learn users](https://nbviewer.jupyter.org/github/IBM/lale/blob/master/examples/docs_guide_for_sklearn_users.ipynb)\n", "\n", "- [How to add new operators](https://nbviewer.jupyter.org/github/IBM/lale/blob/master/examples/docs_new_operators.ipynb)\n", "\n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Grammar Example" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import PolynomialFeatures as PolyFeat\n", "wrap_imported_operators()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:pipeline_1\n", "\n", "\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:pipeline_2\n", "\n", "\n", "\n", "\n", "\n", "cluster:choice_2\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_3\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_4\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_5\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "no_op_0\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "no_op_1\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "no_op_2\n", "\n", "\n", 
"No-\n", "Op\n", "\n", "\n", "\n", "\n", "pca_0\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "no_op_2->pca_0\n", "\n", "\n", "\n", "\n", "pca_1\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "pca_0->pca_1\n", "\n", "\n", "\n", "\n", "norm_0\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "poly_feat_0\n", "\n", "\n", "Poly-\n", "Feat\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "pca_1->lr\n", "\n", "\n", "\n", "\n", "norm_1\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "poly_feat_1\n", "\n", "\n", "Poly-\n", "Feat\n", "\n", "\n", "\n", "\n", "linear_svc\n", "\n", "\n", "Linear-\n", "SVC\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.grammar import Grammar\n", "\n", "g = Grammar()\n", "\n", "g.start = make_pipeline(g.rec_tfms, g.prim_est)\n", "g.rec_tfms = NoOp | make_pipeline(g.rec_tfms, g.prim_tfm)\n", "g.prim_tfm = PCA | Norm | PolyFeat\n", "g.prim_est = LR | LinearSVC(dual=False) | XGBoost\n", "\n", "unrolled = g.unfold(3)\n", "unrolled.visualize()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "```python\n", "linear_svc = LinearSVC(dual=False)\n", "pipeline = (NoOp | (NoOp | (NoOp) >> (PCA | Norm | PolyFeat)) >> (PCA | Norm | PolyFeat)) >> (LR | linear_svc | XGBoost)\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "unrolled.pretty_print(ipython_display=True, show_imports=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 
}