{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exponentiated Gradient Reduction\n", "\n", "Exponentiated gradient reduction is an in-processing technique that reduces fair classification to a sequence of cost-sensitive classification problems, returning a randomized classifier with the lowest empirical error subject to \n", "fair classification constraints. The code for exponentiated gradient reduction wraps the source class \n", "`fairlearn.reductions.ExponentiatedGradient` available in the https://github.com/fairlearn/fairlearn library,\n", "licensed under the MIT Licencse, Copyright Microsoft Corporation.\n", "\n", "This version of exponentiated gradient reduction (implemented in `aif360.algorithms`) wraps the sklearn compatible version of exponentiated gradient reduction implemented in `aif360.sklearn`. For a detailed tutorial on sklearn compatible exponentiated gradient reduction see [examples/sklearn/demo_exponentiated_gradient_reduction_sklearn.ipynb](sklearn/demo_exponentiated_gradient_reduction_sklearn.ipynb). " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\", category=FutureWarning)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load all necessary packages\n", "from aif360.metrics import BinaryLabelDatasetMetric\n", "from aif360.metrics import ClassificationMetric\n", "\n", "from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult\n", "\n", "from aif360.algorithms.inprocessing.exponentiated_gradient_reduction import ExponentiatedGradientReduction\n", "\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.preprocessing import MaxAbsScaler\n", "from sklearn.metrics import accuracy_score\n", "\n", "from IPython.display import Markdown, display\n", "\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Load dataset and set options" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Get the dataset and split into train and test\n", "dataset_orig = load_preproc_data_adult()\n", "\n", "privileged_groups = [{'sex': 1}]\n", "unprivileged_groups = [{'sex': 0}]\n", "\n", "np.random.seed(0)\n", "dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "#### Training Dataset shape" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "(34189, 18)\n" ] }, { "data": { "text/markdown": [ "#### Favorable and unfavorable labels" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "1.0 0.0\n" ] }, { "data": { "text/markdown": [ "#### Protected attribute names" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "['sex', 'race']\n" ] }, { "data": { "text/markdown": [ "#### Privileged and unprivileged protected attribute values" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[array([1.]), array([1.])] [array([0.]), array([0.])]\n" ] }, { "data": { "text/markdown": [ "#### Dataset feature names" ], "text/plain": [ "" ] }, 
"metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "['race', 'sex', 'Age (decade)=10', 'Age (decade)=20', 'Age (decade)=30', 'Age (decade)=40', 'Age (decade)=50', 'Age (decade)=60', 'Age (decade)=>=70', 'Education Years=6', 'Education Years=7', 'Education Years=8', 'Education Years=9', 'Education Years=10', 'Education Years=11', 'Education Years=12', 'Education Years=<6', 'Education Years=>12']\n" ] } ], "source": [ "# print out some labels, names, etc.\n", "display(Markdown(\"#### Training Dataset shape\"))\n", "print(dataset_orig_train.features.shape)\n", "display(Markdown(\"#### Favorable and unfavorable labels\"))\n", "print(dataset_orig_train.favorable_label, dataset_orig_train.unfavorable_label)\n", "display(Markdown(\"#### Protected attribute names\"))\n", "print(dataset_orig_train.protected_attribute_names)\n", "display(Markdown(\"#### Privileged and unprivileged protected attribute values\"))\n", "print(dataset_orig_train.privileged_protected_attributes, \n", " dataset_orig_train.unprivileged_protected_attributes)\n", "display(Markdown(\"#### Dataset feature names\"))\n", "print(dataset_orig_train.feature_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Metric for original training data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "#### Original training dataset" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.193075\n", "Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.198048\n" ] } ], "source": [ "# Metric for the original dataset\n", "metric_orig_train = BinaryLabelDatasetMetric(dataset_orig_train, \n", " unprivileged_groups=unprivileged_groups,\n", " privileged_groups=privileged_groups)\n", "display(Markdown(\"#### Original training dataset\"))\n", "print(\"Train set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_orig_train.mean_difference())\n", "metric_orig_test = BinaryLabelDatasetMetric(dataset_orig_test, \n", " unprivileged_groups=unprivileged_groups,\n", " privileged_groups=privileged_groups)\n", "print(\"Test set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_orig_test.mean_difference())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "#### Scaled dataset - Verify that the scaling does not affect the group label statistics" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.193075\n", "Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.198048\n" ] } ], "source": [ "min_max_scaler = MaxAbsScaler()\n", "dataset_orig_train.features = min_max_scaler.fit_transform(dataset_orig_train.features)\n", "dataset_orig_test.features = min_max_scaler.transform(dataset_orig_test.features)\n", "metric_scaled_train = BinaryLabelDatasetMetric(dataset_orig_train, \n", " unprivileged_groups=unprivileged_groups,\n", " privileged_groups=privileged_groups)\n", "display(Markdown(\"#### Scaled dataset - Verify that the scaling does not affect the group label statistics\"))\n", "print(\"Train set: Difference 
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    { "data": { "text/markdown": [ "#### Scaled dataset - Verify that the scaling does not affect the group label statistics" ], "text/plain": [ "<IPython.core.display.Markdown object>" ] }, "metadata": {}, "output_type": "display_data" },
    { "name": "stdout", "output_type": "stream", "text": [ "Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.193075\n", "Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.198048\n" ] }
   ],
   "source": [
    "max_abs_scaler = MaxAbsScaler()\n",
    "dataset_orig_train.features = max_abs_scaler.fit_transform(dataset_orig_train.features)\n",
    "dataset_orig_test.features = max_abs_scaler.transform(dataset_orig_test.features)\n",
    "metric_scaled_train = BinaryLabelDatasetMetric(dataset_orig_train,\n",
    "                                               unprivileged_groups=unprivileged_groups,\n",
    "                                               privileged_groups=privileged_groups)\n",
    "display(Markdown(\"#### Scaled dataset - Verify that the scaling does not affect the group label statistics\"))\n",
    "print(\"Train set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_scaled_train.mean_difference())\n",
    "metric_scaled_test = BinaryLabelDatasetMetric(dataset_orig_test,\n",
    "                                              unprivileged_groups=unprivileged_groups,\n",
    "                                              privileged_groups=privileged_groups)\n",
    "print(\"Test set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_scaled_test.mean_difference())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Standard Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    { "data": { "text/markdown": [ "#### Accuracy" ], "text/plain": [ "<IPython.core.display.Markdown object>" ] }, "metadata": {}, "output_type": "display_data" },
    { "name": "stdout", "output_type": "stream", "text": [ "0.8042039172865625\n" ] }
   ],
   "source": [
    "X_train = dataset_orig_train.features\n",
    "y_train = dataset_orig_train.labels.ravel()\n",
    "\n",
    "lmod = LogisticRegression(solver='lbfgs')\n",
    "lmod.fit(X_train, y_train, sample_weight=dataset_orig_train.instance_weights)\n",
    "\n",
    "X_test = dataset_orig_test.features\n",
    "y_test = dataset_orig_test.labels.ravel()\n",
    "\n",
    "y_pred = lmod.predict(X_test)\n",
    "\n",
    "display(Markdown(\"#### Accuracy\"))\n",
    "lr_acc = accuracy_score(y_test, y_pred)\n",
    "print(lr_acc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    { "data": { "text/markdown": [ "#### Average odds difference" ], "text/plain": [ "<IPython.core.display.Markdown object>" ] }, "metadata": {}, "output_type": "display_data" },
    { "name": "stdout", "output_type": "stream", "text": [ "-0.27273605621431707\n" ] }
   ],
   "source": [
    "dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)\n",
    "dataset_orig_test_pred.labels = y_pred\n",
    "\n",
    "# Positive class index\n",
    "pos_ind = np.where(lmod.classes_ == dataset_orig_train.favorable_label)[0][0]\n",
    "dataset_orig_test_pred.scores = lmod.predict_proba(X_test)[:, pos_ind].reshape(-1, 1)\n",
    "\n",
    "metric_test = ClassificationMetric(dataset_orig_test,\n",
    "                                   dataset_orig_test_pred,\n",
    "                                   unprivileged_groups=unprivileged_groups,\n",
    "                                   privileged_groups=privileged_groups)\n",
    "display(Markdown(\"#### Average odds difference\"))\n",
    "lr_aod = metric_test.average_odds_difference()\n",
    "print(lr_aod)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exponentiated Gradient Reduction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Choose a base model for the randomized classifier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = LogisticRegression(solver='lbfgs', max_iter=1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train the randomized classifier and observe test accuracy. Other options for `constraints` include \"DemographicParity\", \"TruePositiveRateParity\", \"FalsePositiveRateParity\", and \"ErrorRateRatio\"; a sketch using \"DemographicParity\" follows the training cell below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    { "name": "stderr", "output_type": "stream", "text": [ "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" ] }
   ],
   "source": [
    "np.random.seed(0)  # needed for reproducibility\n",
    "exp_grad_red = ExponentiatedGradientReduction(estimator=estimator,\n",
    "                                              constraints=\"EqualizedOdds\",\n",
    "                                              drop_prot_attr=False)\n",
    "exp_grad_red.fit(dataset_orig_train)\n",
    "exp_grad_red_pred = exp_grad_red.predict(dataset_orig_test)"
   ]
  },
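  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a point of comparison, the commented-out sketch below would run the same reduction under a \"DemographicParity\" constraint, which equalizes selection rates rather than error rates across groups. The `exp_grad_red_dp` names are illustrative, and the cell is not executed in this demo."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative variant (not run here): the same reduction under a\n",
    "# demographic parity constraint. Uncomment to compare with the\n",
    "# equalized-odds results below.\n",
    "# np.random.seed(0)\n",
    "# exp_grad_red_dp = ExponentiatedGradientReduction(estimator=estimator,\n",
    "#                                                  constraints=\"DemographicParity\",\n",
    "#                                                  drop_prot_attr=False)\n",
    "# exp_grad_red_dp.fit(dataset_orig_train)\n",
    "# exp_grad_red_dp_pred = exp_grad_red_dp.predict(dataset_orig_test)"
   ]
  },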
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    { "data": { "text/markdown": [ "#### Accuracy" ], "text/plain": [ "<IPython.core.display.Markdown object>" ] }, "metadata": {}, "output_type": "display_data" },
    { "name": "stdout", "output_type": "stream", "text": [ "0.7865283559680611\n" ] },
    { "data": { "text/markdown": [ "#### Average odds difference" ], "text/plain": [ "<IPython.core.display.Markdown object>" ] }, "metadata": {}, "output_type": "display_data" },
    { "name": "stdout", "output_type": "stream", "text": [ "0.011012958938905922\n" ] }
   ],
   "source": [
    "metric_test = ClassificationMetric(dataset_orig_test,\n",
    "                                   exp_grad_red_pred,\n",
    "                                   unprivileged_groups=unprivileged_groups,\n",
    "                                   privileged_groups=privileged_groups)\n",
    "\n",
    "display(Markdown(\"#### Accuracy\"))\n",
    "egr_acc = metric_test.accuracy()\n",
    "print(egr_acc)\n",
    "\n",
    "# Check that accuracy is comparable to the unconstrained baseline\n",
    "assert abs(lr_acc - egr_acc) < 0.03\n",
    "\n",
    "display(Markdown(\"#### Average odds difference\"))\n",
    "egr_aod = metric_test.average_odds_difference()\n",
    "print(egr_aod)\n",
    "\n",
    "# Check that the average odds difference has improved on the baseline\n",
    "assert abs(egr_aod) < abs(lr_aod)"
   ]
  },
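  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, an optional recap cell that prints the two models side by side, using only quantities computed above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recap: baseline logistic regression vs. exponentiated gradient reduction,\n",
    "# using the accuracy and average odds difference computed above.\n",
    "print(\"Logistic regression:     acc = %.4f, avg odds diff = %+.4f\" % (lr_acc, lr_aod))\n",
    "print(\"Exp. gradient reduction: acc = %.4f, avg odds diff = %+.4f\" % (egr_acc, egr_aod))"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 2
}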