{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Adversarial fairness training on the Adult income data\n",
    "\n",
    "A Keras classifier is trained to predict whether `income` is `>50K`. It is then trained against an adversary network that tries to recover the sensitive attributes (`race`, `gender`) from the classifier output. Accuracy, ROC AUC and the p%-rule are tracked on a held-out split."
   ]
  },
  {
   "metadata": { "trusted": true, "_uuid": "1713f4306d45835efceefad592e4e6b046302f23", "collapsed": true },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "np.random.seed(42)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import accuracy_score, roc_auc_score\n",
    "from sklearn.utils.class_weight import compute_class_weight\n",
    "\n",
    "from keras.layers import Input, Dense, Dropout\n",
    "from keras.models import Model"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "metadata": { "trusted": true, "_uuid": "e1b433fb1c8346a04cff7d90f162a90afc6a97e9", "collapsed": true },
   "cell_type": "code",
   "source": [
    "!ls ../input"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "67ca0a2af21dfcd70a0d6ea024ae1417c0db0097" },
   "cell_type": "code",
   "source": [
    "path = '../input/adult.csv'\n",
    "# '?' entries are read as missing values.\n",
    "input_data = pd.read_csv(path, na_values=\"?\")"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "_uuid": "c2cdce6626547f1f755782cd426c777173f564ac", "collapsed": true },
   "cell_type": "code",
   "source": [
    "input_data.head()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "6d81a92b7651715cabfed67491e587863c1d8698" },
   "cell_type": "code",
   "source": [
    "# Keep only two race groups so that 'race' becomes a binary attribute.\n",
    "input_data = input_data[input_data['race'].isin(['White', 'Black'])]"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "_uuid": "90028dc30c91810dfb793280dbc3b89930303e7c", "collapsed": true },
   "cell_type": "code",
   "source": [
    "input_data.head()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sensitive attributes, target and features"
   ]
  },
  {
   "metadata": { "trusted": true, "_uuid": "5092dbcf31d13e4b76411bbece6c42e164edd348", "collapsed": true },
   "cell_type": "code",
   "source": [
    "# Sensitive attributes: 'race' and 'gender' are treated as protected.\n",
    "sensitive_attribs = ['race', 'gender']\n",
    "A = input_data[sensitive_attribs]\n",
    "# One 0/1 indicator column per attribute (the first category of each is dropped).\n",
    "# NOTE(review): which category maps to 1 follows get_dummies' category order - check A.head().\n",
    "A = pd.get_dummies(A, drop_first=True).astype(int)\n",
    "# Renaming the columns is only valid when every attribute produced exactly one column.\n",
    "assert A.shape[1] == len(sensitive_attribs), 'each sensitive attribute must be binary'\n",
    "A.columns = sensitive_attribs"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "_uuid": "18e6ae5dda563f17ff3ac647f55f0d990e42e4c7", "collapsed": true },
   "cell_type": "code",
   "source": [
    "A.head()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "_uuid": "2f3877d05de984830b75cc4e0006feec6ca00f8e", "collapsed": true },
   "cell_type": "code",
   "source": [
    "# Binary target: 1 when income is '>50K', else 0.\n",
    "y = (input_data['income'] == '>50K').astype(int)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "916460c4b276af0d12a496d89ef2ff61f5ad96ed" },
   "cell_type": "code",
   "source": [
    "# Features: everything except the target and the sensitive attributes.\n",
    "X = input_data.drop(labels=['income'] + sensitive_attribs, axis=1)\n",
    "# Missing values become their own 'Unknown' category before one-hot encoding.\n",
    "X = X.fillna('Unknown')\n",
    "X = pd.get_dummies(X, drop_first=True)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "1c7a4316a7f23f885620a7ca456443c3cf4fcae0" },
   "cell_type": "code",
   "source": [
    "# Split into train/test sets (50/50, stratified on the target).\n",
    "X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(\n",
    "    X, y, A, test_size=0.5, stratify=y, random_state=7)\n",
    "\n",
    "# Standardize with statistics fitted on the training split only.\n",
    "scaler = StandardScaler().fit(X_train)\n",
    "X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)\n",
    "X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "33d6dd864dd78f81025e9e836cbb57b2c2b02643" },
   "cell_type": "code",
   "source": [
    "def p_rule(y_pred, a_values, threshold=0.5):\n",
    "    \"\"\"Return the p%-rule score of predictions with respect to a binary attribute.\n",
    "\n",
    "    The positive-outcome rate is computed separately for the samples with\n",
    "    ``a_values == 1`` and ``a_values == 0``; the result is the smaller of\n",
    "    ``odds`` and ``1 / odds`` (``odds`` being the ratio of the two rates),\n",
    "    multiplied by 100.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    y_pred : array-like\n",
    "        Predicted scores; flattened to one value per sample.\n",
    "    a_values : array-like\n",
    "        0/1 group membership per sample, in the same order as ``y_pred``.\n",
    "    threshold : float or None\n",
    "        Scores strictly above this value count as positive. A falsy value\n",
    "        (``None`` or ``0``) skips thresholding and averages the raw scores.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Score between 0 and 100. It is 0 when exactly one group has a zero\n",
    "        rate and NaN when both rates are zero.\n",
    "    \"\"\"\n",
    "    # Flatten both inputs so (n, 1) model outputs and pandas Series behave alike.\n",
    "    scores = np.asarray(y_pred).ravel()\n",
    "    groups = np.asarray(a_values).ravel()\n",
    "    outcomes = scores > threshold if threshold else scores\n",
    "    rate_1 = outcomes[groups == 1].mean()\n",
    "    rate_0 = outcomes[groups == 0].mean()\n",
    "    # A zero rate makes one of the two ratios infinite; np.min still picks the\n",
    "    # finite one, so only the floating-point warnings are silenced here.\n",
    "    with np.errstate(divide='ignore', invalid='ignore'):\n",
    "        odds = rate_1 / rate_0\n",
    "        return np.min([odds, 1 / odds]) * 100"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "552af7ac22b3f3f52424bc183d15d8d89cfdae18" },
   "cell_type": "code",
   "source": [
    "def make_trainable_fn(net):\n",
    "    \"\"\"Return a function ``make_trainable(flag)`` that toggles ``net``.\n",
    "\n",
    "    Calling the returned function sets ``trainable = flag`` on the network\n",
    "    itself and on each layer in ``net.layers``.\n",
    "    \"\"\"\n",
    "    # NOTE(review): Keras reads `trainable` when a model is compiled; confirm that\n",
    "    # toggling it after compile has the intended effect in the installed version.\n",
    "    def make_trainable(flag):\n",
    "        net.trainable = flag\n",
    "        for layer in net.layers:\n",
    "            layer.trainable = flag\n",
    "    return make_trainable"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "da96fef499154bed2d2454b12e8350362e5bf65d" },
   "cell_type": "code",
   "source": [
    "def compute_class_weights(data_set):\n",
    "    \"\"\"Return balanced class weights for one or more binary (0/1) label columns.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_set : array-like, 1-D or 2-D\n",
    "        Binary labels. A 2-D input is treated as one label column per attribute.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One ``{0: weight, 1: weight}`` dict per column, in column order\n",
    "        (a single-element list for 1-D input).\n",
    "    \"\"\"\n",
    "    class_values = [0, 1]\n",
    "    label_columns = np.asarray(data_set)\n",
    "    if label_columns.ndim == 1:\n",
    "        # Treat a flat label vector as a single column.\n",
    "        label_columns = label_columns.reshape(-1, 1)\n",
    "    class_weights = []\n",
    "    for attr_idx in range(label_columns.shape[1]):\n",
    "        # Keyword arguments: recent scikit-learn releases reject positional use here.\n",
    "        balanced_weights = compute_class_weight(class_weight='balanced',\n",
    "                                                classes=np.array(class_values),\n",
    "                                                y=label_columns[:, attr_idx])\n",
    "        class_weights.append(dict(zip(class_values, balanced_weights)))\n",
    "    return class_weights"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "1deddf7765ea1ccb094ad147832c35e8977cfcb1" },
   "cell_type": "code",
   "source": [
    "def compute_target_class_weights(y):\n",
    "    \"\"\"Return ``{'y': {0: weight, 1: weight}}`` with balanced weights for target ``y``.\n",
    "\n",
    "    The key ``'y'`` is the name given to the classifier's output layer.\n",
    "    \"\"\"\n",
    "    return {'y': compute_class_weights(np.asarray(y).ravel())[0]}"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Networks"
   ]
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "98e90f1c8f2b380bd8ef2fcae2434fb33c22e1b8" },
   "cell_type": "code",
   "source": [
    "n_features = X_train.shape[1]\n",
    "n_sensitive = A_train.shape[1]\n",
    "# Weight of each adversary loss term, one per sensitive attribute (column order of A).\n",
    "lambdas = [130., 30.]\n",
    "assert len(lambdas) == n_sensitive, 'need exactly one lambda per sensitive attribute'"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "14d4bfe6a04f952fad4a3f0b00f0caf105181d4b" },
   "cell_type": "code",
   "source": [
    "# Classifier: all features -> 3 x [Dense(32, relu) + Dropout(0.2)] -> sigmoid output 'y'.\n",
    "clf_inputs = Input(shape=(n_features,))\n",
    "\n",
    "clf_hidden = clf_inputs\n",
    "for _ in range(3):\n",
    "    clf_hidden = Dense(32, activation='relu')(clf_hidden)\n",
    "    clf_hidden = Dropout(0.2)(clf_hidden)\n",
    "clf_output = Dense(1, activation='sigmoid', name='y')(clf_hidden)\n",
    "clf_net = Model(inputs=[clf_inputs], outputs=[clf_output])"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "_uuid": "2e85c13eac2c339276e14770b17caba341bca582", "collapsed": true },
   "cell_type": "code",
   "source": [
    "# Adversary: classifier output (one number) -> 3 x Dense(32, relu) -> one sigmoid head\n",
    "# per sensitive attribute.\n",
    "adv_inputs = Input(shape=(1,))\n",
    "\n",
    "adv_hidden = adv_inputs\n",
    "for _ in range(3):\n",
    "    adv_hidden = Dense(32, activation='relu')(adv_hidden)\n",
    "adv_outputs = [Dense(1, activation='sigmoid')(adv_hidden) for _ in range(n_sensitive)]\n",
    "adv_net = Model(inputs=[adv_inputs], outputs=adv_outputs)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "ffc19e462ce48456c9e698c4a564f8473489093c" },
   "cell_type": "code",
   "source": [
    "# Switches that mark each network (and its layers) as trainable or frozen.\n",
    "trainable_clf_net = make_trainable_fn(clf_net)\n",
    "trainable_adv_net = make_trainable_fn(adv_net)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "0ba4fdfbfb8233f90affbd4495156a4b4aeca0fc" },
   "cell_type": "code",
   "source": [
    "# Stand-alone classifier, compiled while trainable.\n",
    "clf = clf_net\n",
    "trainable_clf_net(True)\n",
    "clf.compile(loss='binary_crossentropy', optimizer='adam')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "397117080a03743e5eba52b928a5b7db8585d301" },
   "cell_type": "code",
   "source": [
    "# Classifier + adversary super net: outputs are [y] followed by the adversary heads.\n",
    "# The adversary is fed the very tensor that is exposed as the 'y' output, so both\n",
    "# come from a single pass through the classifier.\n",
    "clf_out = clf_net(clf_inputs)\n",
    "adv_out = adv_net(clf_out)\n",
    "# A single-head adversary may return a bare tensor instead of a list.\n",
    "adv_out = list(adv_out) if isinstance(adv_out, (list, tuple)) else [adv_out]\n",
    "clf_w_adv = Model(inputs=[clf_inputs], outputs=[clf_out] + adv_out)\n",
    "\n",
    "# Compiled with the classifier trainable and the adversary frozen.\n",
    "trainable_clf_net(True)\n",
    "trainable_adv_net(False)\n",
    "# +1 on the classifier loss, -lambda on each adversary loss.\n",
    "loss_weights = [1.] + [-lambda_param for lambda_param in lambdas]\n",
    "clf_w_adv.compile(loss='binary_crossentropy',\n",
    "                  loss_weights=loss_weights,\n",
    "                  optimizer='adam')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "568e536f4b0d54022b8022f6579180def3d598d7" },
   "cell_type": "code",
   "source": [
    "# Adversary fed through the classifier: features -> classifier -> adversary.\n",
    "adv = Model(inputs=[clf_inputs], outputs=adv_net(clf_net(clf_inputs)))\n",
    "# Compiled with the classifier frozen and the adversary trainable.\n",
    "trainable_clf_net(False)\n",
    "trainable_adv_net(True)\n",
    "adv.compile(loss='binary_crossentropy', optimizer='adam')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pre-training"
   ]
  },
  {
   "metadata": { "trusted": true, "_uuid": "37bd746cdb7ddedc52948517c091b2a6ab378e27", "collapsed": true },
   "cell_type": "code",
   "source": [
    "# Pre-train the classifier on its own.\n",
    "trainable_clf_net(True)\n",
    "clf.fit(X_train.values, y_train.values, epochs=10);"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "_uuid": "6b9d0368986990063acd2b4b36d239ce9583bc55", "collapsed": true },
   "cell_type": "code",
   "source": [
    "# Pre-train the adversary on the classifier's output, one target array per head.\n",
    "trainable_clf_net(False)\n",
    "trainable_adv_net(True)\n",
    "class_weight_adv = compute_class_weights(A_train)\n",
    "adv.fit(X_train.values,\n",
    "        np.hsplit(A_train.values, A_train.shape[1]),\n",
    "        class_weight=class_weight_adv,\n",
    "        epochs=10);"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "collapsed": true, "_uuid": "8297c4ceb5334aac278aed6260c4f508581a7f61" },
   "cell_type": "code",
   "source": [
    "y_pred = clf.predict(X_test.values)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "_uuid": "91c96a5c1bd240a153270ab162c0df02d86192c3", "collapsed": true },
   "cell_type": "code",
   "source": [
    "# p%-rule of the pre-trained classifier for each sensitive attribute.\n",
    "for sens in A_test.columns:\n",
    "    pr = p_rule(y_pred, A_test[sens])\n",
    "    print(sens, pr)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": { "trusted": true, "_uuid": "d9c86e63a56fe33fa5759239782d1802cd7cd26e", "collapsed": true },
   "cell_type": "code",
   "source": [
    "acc = accuracy_score(y_test, (y_pred.ravel() > 0.5)) * 100\n",
    "print('Clf acc: {:.2f}'.format(acc))"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adversarial training\n",
    "\n",
    "Each iteration trains the adversary for one full epoch, then updates the classifier on a single random mini-batch through the combined model, and finally records metrics on the test split."
   ]
  },
  {
   "metadata": { "trusted": true, "_uuid": "488a8dc18376fa0ff02fa6af5a837611811875a0", "collapsed": true },
   "cell_type": "code",
   "source": [
    "n_iter = 250\n",
    "batch_size = 128\n",
    "n_sensitive = A_train.shape[1]\n",
    "\n",
    "# Per-output class weights of the super net: uniform for 'y', balanced for the adversary heads.\n",
    "class_weight_clf_w_adv = [{0: 1., 1: 1.}] + class_weight_adv\n",
    "\n",
    "val_metrics = pd.DataFrame()\n",
    "fairness_metrics = pd.DataFrame()\n",
    "\n",
    "# Loop-invariant arrays, extracted once.\n",
    "X_train_values = X_train.values\n",
    "X_test_values = X_test.values\n",
    "y_train_values = y_train.values\n",
    "A_train_values = A_train.values\n",
    "A_train_targets = np.hsplit(A_train_values, n_sensitive)\n",
    "\n",
    "for idx in range(n_iter):\n",
    "    # 1) Adversary step: one epoch over the training set, classifier frozen.\n",
    "    trainable_clf_net(False)\n",
    "    trainable_adv_net(True)\n",
    "    adv.fit(X_train_values,\n",
    "            A_train_targets,\n",
    "            batch_size=batch_size,\n",
    "            class_weight=class_weight_adv,\n",
    "            epochs=1, verbose=0)\n",
    "\n",
    "    # 2) Classifier step: one random mini-batch, adversary frozen.\n",
    "    trainable_clf_net(True)\n",
    "    trainable_adv_net(False)\n",
    "    indices = np.random.permutation(len(X_train))[:batch_size]\n",
    "    clf_w_adv.train_on_batch(\n",
    "        X_train_values[indices],\n",
    "        [y_train_values[indices]] + np.hsplit(A_train_values[indices], n_sensitive),\n",
    "        class_weight=class_weight_clf_w_adv)\n",
    "\n",
    "    # 3) Predictions on the held-out split, indexed like y_test.\n",
    "    y_pred = pd.Series(clf.predict(X_test_values).ravel(), index=y_test.index)\n",
    "\n",
    "    # ROC AUC and accuracy (in percent).\n",
    "    roc_auc = roc_auc_score(y_test, y_pred)\n",
    "    acc = accuracy_score(y_test, (y_pred > 0.5)) * 100\n",
    "    val_metrics.loc[idx, 'ROC AUC'] = roc_auc\n",
    "    val_metrics.loc[idx, 'Accuracy'] = acc\n",
    "\n",
    "    # p%-rule per sensitive attribute.\n",
    "    for sensitive_attr in A_test.columns:\n",
    "        fairness_metrics.loc[idx, sensitive_attr] = p_rule(y_pred, A_test[sensitive_attr])\n",
    "\n",
    "    # Progress line, e.g. 'Epoch: 0, Accuracy: 80.00, Race P: 50.00, Gender P: 40.00'.\n",
    "    p_rule_report = ', '.join(\n",
    "        '{} P: {:.2f}'.format(attr.capitalize(), fairness_metrics.loc[idx, attr])\n",
    "        for attr in A_test.columns)\n",
    "    print('Epoch: {}, Accuracy: {:.2f}, {}'.format(idx, acc, p_rule_report))"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results"
   ]
  },
  {
   "metadata": { "trusted": true, "_uuid": "d43f2519b2c87f6f12669cc5c6005811836a5724", "collapsed": true },
   "cell_type": "code",
   "source": [
    "plt.figure(figsize=(10, 7))\n",
    "plt.title('Test-split metrics during adversarial training')\n",
    "plt.xlabel('Epochs')\n",
    "plt.ylabel('Percent')\n",
    "plt.plot(val_metrics['Accuracy'], label='Accuracy')\n",
    "plt.plot(val_metrics['ROC AUC'] * 100, label='ROC AUC')\n",
    "# One p%-rule curve per sensitive attribute.\n",
    "for attr in fairness_metrics.columns:\n",
    "    plt.plot(fairness_metrics[attr], label=attr.capitalize())\n",
    "plt.legend();"
   ],
   "execution_count": null,
   "outputs": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": {
   "name": "python",
   "version": "3.6.5",
   "mimetype": "text/x-python",
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}