{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: liac-arff>=2.4.0 in /home/hirzel/python3.6venv/lib/python3.6/site-packages (2.4.0)\r\n" ] } ], "source": [ "# Install the ARFF reader with the %pip magic (not !pip) so the package\n", "# lands in the environment of the running kernel rather than whichever\n", "# pip executable happens to be first on the shell PATH.\n", "%pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classchecking_statusdurationcredit_historypurposecredit_amountsavings_statusemploymentinstallment_commitmentpersonal_status...residence_sinceproperty_magnitudeageother_payment_planshousingexisting_creditsjobnum_dependentsown_telephoneforeign_worker
835bad<012.0no credits/all paidnew car1082.0<1001<=X<44.0male single...4.0car48.0bankown2.0skilled1.0noneyes
192bad0<=X<20027.0existing paidbusiness3915.0<1001<=X<44.0male single...2.0car36.0noneown1.0skilled2.0yesyes
629goodno checking9.0existing paideducation3832.0no known savings>=71.0male single...4.0real estate64.0noneown1.0unskilled resident1.0noneyes
559bad0<=X<20018.0critical/other existing creditfurniture/equipment1928.0<100<12.0male single...2.0real estate31.0noneown2.0unskilled resident1.0noneyes
684good0<=X<20036.0delayed previouslybusiness9857.0100<=X<5004<=X<71.0male single...3.0life insurance31.0noneown2.0unskilled resident2.0yesyes
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " class checking_status duration credit_history \\\n", "835 bad <0 12.0 no credits/all paid \n", "192 bad 0<=X<200 27.0 existing paid \n", "629 good no checking 9.0 existing paid \n", "559 bad 0<=X<200 18.0 critical/other existing credit \n", "684 good 0<=X<200 36.0 delayed previously \n", "\n", " purpose credit_amount savings_status employment \\\n", "835 new car 1082.0 <100 1<=X<4 \n", "192 business 3915.0 <100 1<=X<4 \n", "629 education 3832.0 no known savings >=7 \n", "559 furniture/equipment 1928.0 <100 <1 \n", "684 business 9857.0 100<=X<500 4<=X<7 \n", "\n", " installment_commitment personal_status ... residence_since \\\n", "835 4.0 male single ... 4.0 \n", "192 4.0 male single ... 2.0 \n", "629 1.0 male single ... 4.0 \n", "559 2.0 male single ... 2.0 \n", "684 1.0 male single ... 3.0 \n", "\n", " property_magnitude age other_payment_plans housing existing_credits \\\n", "835 car 48.0 bank own 2.0 \n", "192 car 36.0 none own 1.0 \n", "629 real estate 64.0 none own 1.0 \n", "559 real estate 31.0 none own 2.0 \n", "684 life insurance 31.0 none own 2.0 \n", "\n", " job num_dependents own_telephone foreign_worker \n", "835 skilled 1.0 none yes \n", "192 skilled 2.0 yes yes \n", "629 unskilled resident 1.0 none yes \n", "559 unskilled resident 1.0 none yes \n", "684 unskilled resident 2.0 yes yes \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import lale.datasets.openml\n", "import pandas as pd\n", "(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n", " 'credit-g', 'classification', preprocess=False)\n", "pd.concat([train_y.tail(), train_X.tail()], axis=1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import Normalizer as Norm\n", "from lale.lib.lale import NoOp\n", "from sklearn.preprocessing import OneHotEncoder as OneHot\n", "from sklearn.linear_model import 
LogisticRegression as LR\n", "from xgboost import XGBClassifier as XGBoost\n", "from sklearn.svm import LinearSVC\n", "from sklearn.compose import ColumnTransformer\n", "from lale.operators import make_pipeline\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "num_cols = [col for col in train_X.columns\n", " if np.issubdtype(train_X.dtypes[col], np.number)]\n", "cat_cols = [col for col in train_X.columns if col not in num_cols]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:column_transformer\n", "\n", "\n", "ColumnTransformer\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "norm->lr\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pipeline_trainable = ColumnTransformer(\n", " transformers=[\n", " ('num_tfm', Norm(norm='l1'), num_cols),\n", " ('cat_tfm', OneHot(), cat_cols)]) >> LR()\n", "pipeline_trainable.visualize()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1 s, sys: 188 ms, total: 1.19 s\n", "Wall time: 1.06 s\n" ] } ], "source": [ "%%time\n", "pipeline_trained = pipeline_trainable.fit(train_X, train_y)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy 71.2%\n" ] } ], "source": [ "import sklearn.metrics\n", "predictions = pipeline_trained.predict(test_X)\n", "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, predictions):.1%}')" ] }, { "cell_type": "code", 
"execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:column_transformer\n", "\n", "\n", "ColumnTransformer\n", "\n", "\n", "\n", "cluster:choice\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "norm->lr\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "linear_svc\n", "\n", "\n", "Linear-\n", "SVC\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pipeline_planned = make_pipeline(\n", " ColumnTransformer(transformers=[\n", " ('num_tfm', Norm | NoOp, num_cols),\n", " ('cat_tfm', OneHot, cat_cols)]),\n", " LR | LinearSVC(dual=False)| XGBoost)\n", "pipeline_planned.visualize()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|█████████| 5/5 [00:52<00:00, 10.58s/trial, best loss: -0.7507273649370062]\n", "CPU times: user 55.7 s, sys: 3.28 s, total: 59 s\n", "Wall time: 54.6 s\n" ] } ], "source": [ "%%time\n", "from lale.lib.lale import Hyperopt\n", "pipeline_trained = pipeline_planned.auto_configure(\n", " train_X, train_y, Hyperopt, cv=3, max_evals=5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:column_transformer\n", "\n", "\n", "ColumnTransformer\n", "\n", "\n", "\n", "\n", "norm\n", "\n", "\n", "Norm\n", "\n", "\n", "\n", "\n", "linear_svc\n", "\n", "\n", "Linear-\n", 
"SVC\n", "\n", "\n", "\n", "\n", "norm->linear_svc\n", "\n", "\n", "\n", "\n", "one_hot\n", "\n", "\n", "One-\n", "Hot\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "```python\n", "norm = Norm(norm=\"l1\")\n", "column_transformer = ColumnTransformer(\n", " transformers=[\n", " (\n", " \"num_tfm\",\n", " norm,\n", " [\n", " \"duration\", \"credit_amount\", \"installment_commitment\",\n", " \"residence_since\", \"age\", \"existing_credits\", \"num_dependents\",\n", " ],\n", " ),\n", " (\n", " \"cat_tfm\",\n", " OneHot(),\n", " [\n", " \"checking_status\", \"credit_history\", \"purpose\",\n", " \"savings_status\", \"employment\", \"personal_status\",\n", " \"other_parties\", \"property_magnitude\", \"other_payment_plans\",\n", " \"housing\", \"job\", \"own_telephone\", \"foreign_worker\",\n", " ],\n", " ),\n", " ]\n", ")\n", "linear_svc = LinearSVC(\n", " dual=False,\n", " C=16757.615906506046,\n", " fit_intercept=False,\n", " tol=0.0006905134087360421,\n", ")\n", "pipeline = column_transformer >> linear_svc\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pipeline_trained.visualize()\n", "pipeline_trained.pretty_print(ipython_display=True, show_imports=False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy 72.1%\n" ] } ], "source": [ "predictions = pipeline_trained.predict(test_X)\n", "print(f'accuracy {sklearn.metrics.accuracy_score(test_y, predictions):.1%}')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, 
"nbformat_minor": 2 }