{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:Missing Data: 3620 rows removed from AdultDataset.\n", "/home/hirzel/python3.6venv/lib/python3.6/site-packages/aif360/datasets/standard_dataset.py:121: FutureWarning: outer method for ufunc is not implemented on pandas objects. Returning an ndarray, but in the future this will raise a 'NotImplementedError'. Consider explicitly converting the Series to an array with '.array' first.\n", " priv = np.logical_or.reduce(np.equal.outer(vals, df[attr]))\n", "/home/hirzel/python3.6venv/lib/python3.6/site-packages/aif360/datasets/standard_dataset.py:142: FutureWarning: outer method for ufunc is not implemented on pandas objects. Returning an ndarray, but in the future this will raise a 'NotImplementedError'. Consider explicitly converting the Series to an array with '.array' first.\n", " df[label_name]))\n" ] }, { "data": { "text/plain": [ "{'favorable_label': 1.0,\n", " 'unfavorable_label': 0.0,\n", " 'protected_attribute_names': ['race', 'sex'],\n", " 'unprivileged_groups': [{'race': 0.0, 'sex': 0.0}],\n", " 'privileged_groups': [{'race': 1.0, 'sex': 1.0}]}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import aif360.datasets\n", "import lale.lib.aif360\n", "\n", "# Load the Adult dataset through AIF360 with its default settings.\n", "orig_ds = aif360.datasets.AdultDataset()\n", "# Derive the fairness metadata (labels, protected attributes, groups) from the dataset;\n", "# later cells pass it on as **fairness_info to the fairness scorers and mitigators.\n", "fairness_info = lale.lib.aif360.dataset_fairness_info(orig_ds)\n", "fairness_info" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shapes: train_X (7913, 98), train_y (7913,), test_X (3392, 98), test_y (3392,)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
income-per-yearageeducation-numracesexcapital-gaincapital-losshours-per-weekworkclass=Federal-govworkclass=Local-gov...native-country=Portugalnative-country=Puerto-Riconative-country=Scotlandnative-country=Southnative-country=Taiwannative-country=Thailandnative-country=Trinadad&Tobagonative-country=United-Statesnative-country=Vietnamnative-country=Yugoslavia
01.028.014.01.01.00.00.045.00.00.0...0.00.00.00.00.00.00.00.00.00.0
10.060.05.01.01.00.00.045.00.00.0...0.00.00.00.00.00.00.01.00.00.0
20.043.010.01.01.00.01669.045.00.00.0...0.00.00.00.00.00.00.01.00.00.0
30.035.09.01.01.00.00.040.00.00.0...0.00.00.00.00.00.00.01.00.00.0
41.036.010.01.01.00.00.040.00.00.0...0.00.00.00.00.00.00.01.00.00.0
\n", "

5 rows × 99 columns

\n", "
" ], "text/plain": [ " income-per-year age education-num race sex capital-gain \\\n", "0 1.0 28.0 14.0 1.0 1.0 0.0 \n", "1 0.0 60.0 5.0 1.0 1.0 0.0 \n", "2 0.0 43.0 10.0 1.0 1.0 0.0 \n", "3 0.0 35.0 9.0 1.0 1.0 0.0 \n", "4 1.0 36.0 10.0 1.0 1.0 0.0 \n", "\n", " capital-loss hours-per-week workclass=Federal-gov workclass=Local-gov \\\n", "0 0.0 45.0 0.0 0.0 \n", "1 0.0 45.0 0.0 0.0 \n", "2 1669.0 45.0 0.0 0.0 \n", "3 0.0 40.0 0.0 0.0 \n", "4 0.0 40.0 0.0 0.0 \n", "\n", " ... native-country=Portugal native-country=Puerto-Rico \\\n", "0 ... 0.0 0.0 \n", "1 ... 0.0 0.0 \n", "2 ... 0.0 0.0 \n", "3 ... 0.0 0.0 \n", "4 ... 0.0 0.0 \n", "\n", " native-country=Scotland native-country=South native-country=Taiwan \\\n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 \n", "\n", " native-country=Thailand native-country=Trinadad&Tobago \\\n", "0 0.0 0.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 0.0 0.0 \n", "4 0.0 0.0 \n", "\n", " native-country=United-States native-country=Vietnam \\\n", "0 0.0 0.0 \n", "1 1.0 0.0 \n", "2 1.0 0.0 \n", "3 1.0 0.0 \n", "4 1.0 0.0 \n", "\n", " native-country=Yugoslavia \n", "0 0.0 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 \n", "\n", "[5 rows x 99 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "from lale.lib.aif360 import dataset_to_pandas\n", "\n", "# Use only a seeded, shuffled 25% sample of the data so this notebook runs faster;\n", "# the remainder (ignore_ds) is not used by any later cell.\n", "used_fraction, train_fraction = 0.25, 0.7\n", "used_ds, ignore_ds = orig_ds.split([used_fraction], shuffle=True, seed=42)\n", "# Split that sample 70/30 into train and test, then convert each part to pandas X/y.\n", "train_ds, test_ds = used_ds.split([train_fraction])\n", "train_X, train_y = dataset_to_pandas(train_ds)\n", "test_X, test_y = dataset_to_pandas(test_ds)\n", "print(f'shapes: train_X {train_X.shape}, train_y {train_y.shape}, test_X {test_X.shape}, test_y {test_y.shape}')\n", "# Preview the label column next to the first feature rows.\n", "pd.concat([train_y.head(), train_X.head()], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Search Initial Pipeline" ] }, { "cell_type": "code", 
"execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "from sklearn.linear_model import LogisticRegression as LR\n", "from xgboost import XGBClassifier as XGBoost\n", "\n", "import lale\n", "from lale.lib.lale import NoOp\n", "\n", "# Wrap the operators imported above so they can be combined with Lale's | and >> combinators.\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "cluster:choice_0\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "cluster:choice_1\n", "\n", "\n", "Choice\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "pca->lr\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "xg_boost\n", "\n", "\n", "XG-\n", "Boost\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Planned pipeline: a choice of preprocessing step (PCA or NoOp) piped into a choice of classifier (LR or XGBoost).\n", "preprocessing_choice = PCA | NoOp\n", "classifier_choice = LR | XGBoost\n", "initial_planned = preprocessing_choice >> classifier_choice\n", "initial_planned.visualize()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:34<00:00, 14.73s/trial, best loss: -0.813349862426007]\n" ] } ], "source": [ "from lale.lib.lale import Hyperopt\n", "\n", "# Search the planned pipeline with Hyperopt: only 3 trials (max_evals) with cv=3, to keep the run short.\n", "initial_trained = initial_planned.auto_configure(\n", "    train_X,\n", "    train_y,\n", "    optimizer=Hyperopt,\n", "    cv=3,\n", "    max_evals=3,\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op->lr\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "```python\n", "lr = LR(C=1327.7214911678875, dual=True, 
tol=0.0856386428135445)\n", "pipeline = NoOp() >> lr\n", "```" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Show the pipeline selected by the search, first as a graph and then as Python code.\n", "initial_trained.visualize()\n", "initial_trained.pretty_print(ipython_display=True, show_imports=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluate Initial Pipeline" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy 83.1%\n" ] } ], "source": [ "# accuracy, ideal 100%, higher values indicate better predictive performance\n", "import sklearn.metrics\n", "accuracy_scorer = sklearn.metrics.make_scorer(sklearn.metrics.accuracy_score)\n", "print(f'accuracy {accuracy_scorer(initial_trained, test_X, test_y):.1%}')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "statistical parity difference -0.26\n" ] } ], "source": [ "# statistical parity difference, ideal 0, fair between -0.1 and +0.1\n", "stat_par_scorer = lale.lib.aif360.statistical_parity_difference(**fairness_info)\n", "print(f'statistical parity difference {stat_par_scorer(initial_trained, test_X, test_y):.2f}')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "disparate impact 0.25\n" ] } ], "source": [ "# disparate impact, ideal 1, fair between 0.8 and 1.25\n", "# (0.8 is the four-fifths rule; 1.25 = 1/0.8 is its reciprocal, so the band is symmetric as a ratio)\n", "disparate_impact_scorer = lale.lib.aif360.disparate_impact(**fairness_info)\n", "print(f'disparate impact {disparate_impact_scorer(initial_trained, test_X, test_y):.2f}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Equalized Odds Postprocessing" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "EqOddsPostprocessing\n", "\n", "\n", "\n", 
"cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op->lr\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.aif360 import EqOddsPostprocessing\n", "\n", "# Wrap the already-trained initial pipeline in an EqOddsPostprocessing operator configured by fairness_info.\n", "eqodds_trainable = EqOddsPostprocessing(estimator=initial_trained, **fairness_info)\n", "eqodds_trainable.visualize()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.73 s, sys: 172 ms, total: 1.91 s\n", "Wall time: 2.21 s\n" ] } ], "source": [ "%%time\n", "# Fit on the training split; the %%time magic above must stay on the first line of the cell.\n", "eqodds_trained = eqodds_trainable.fit(train_X, train_y)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy 73.6%\n", "statistical parity difference 0.01\n", "disparate impact 1.02\n" ] } ], "source": [ "# Score the post-processed model on the test split with the three scorers created for the initial pipeline.\n", "for label, scorer, fmt in [\n", "    ('accuracy', accuracy_scorer, '.1%'),\n", "    ('statistical parity difference', stat_par_scorer, '.2f'),\n", "    ('disparate impact', disparate_impact_scorer, '.2f'),\n", "]:\n", "    print(f'{label} {scorer(eqodds_trained, test_X, test_y):{fmt}}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calibrated Equalized Odds Postprocessing" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "CalibratedEqOddsPostprocessing\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op->lr\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.aif360 
import CalibratedEqOddsPostprocessing\n", "\n", "# Wrap the already-trained initial pipeline in a CalibratedEqOddsPostprocessing operator configured by fairness_info.\n", "caleqo_trainable = CalibratedEqOddsPostprocessing(estimator=initial_trained, **fairness_info)\n", "caleqo_trainable.visualize()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.23 s, sys: 250 ms, total: 2.48 s\n", "Wall time: 2.61 s\n" ] } ], "source": [ "%%time\n", "# Fit on the training split; the %%time magic above must stay on the first line of the cell.\n", "caleqo_trained = caleqo_trainable.fit(train_X, train_y)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy 75.5%\n", "statistical parity difference 0.02\n", "disparate impact inf\n" ] } ], "source": [ "# Score the post-processed model on the test split with the three scorers created for the initial pipeline.\n", "# NOTE(review): the stored output shows 'disparate impact inf'; presumably the privileged group\n", "# receives no favorable predictions on the test split (a zero denominator) - confirm before relying on it.\n", "for label, scorer, fmt in [\n", "    ('accuracy', accuracy_scorer, '.1%'),\n", "    ('statistical parity difference', stat_par_scorer, '.2f'),\n", "    ('disparate impact', disparate_impact_scorer, '.2f'),\n", "]:\n", "    print(f'{label} {scorer(caleqo_trained, test_X, test_y):{fmt}}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reject Option Classification Postprocessing" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "cluster:(root)\n", "\n", "\n", "RejectOptionClassification\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "no_op\n", "\n", "\n", "No-\n", "Op\n", "\n", "\n", "\n", "\n", "lr\n", "\n", "\n", "LR\n", "\n", "\n", "\n", "\n", "no_op->lr\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from lale.lib.aif360 import RejectOptionClassification\n", "\n", "# Wrap the already-trained initial pipeline in a RejectOptionClassification operator configured by fairness_info.\n", "rejopt_trainable = RejectOptionClassification(estimator=initial_trained, **fairness_info)\n", "rejopt_trainable.visualize()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: 
user 56.1 s, sys: 3.89 s, total: 1min\n", "Wall time: 1min 8s\n" ] } ], "source": [ "%%time\n", "# Fit on the training split; the stored timing for this cell is about a minute of wall time.\n", "rejopt_trained = rejopt_trainable.fit(train_X, train_y)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accuracy 75.9%\n", "statistical parity difference 0.05\n", "disparate impact inf\n" ] } ], "source": [ "# Score the post-processed model on the test split with the three scorers created for the initial pipeline.\n", "# NOTE(review): the stored output shows 'disparate impact inf'; presumably the privileged group\n", "# receives no favorable predictions on the test split (a zero denominator) - confirm before relying on it.\n", "for label, scorer, fmt in [\n", "    ('accuracy', accuracy_scorer, '.1%'),\n", "    ('statistical parity difference', stat_par_scorer, '.2f'),\n", "    ('disparate impact', disparate_impact_scorer, '.2f'),\n", "]:\n", "    print(f'{label} {scorer(rejopt_trained, test_X, test_y):{fmt}}')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }