# Census income classification with Keras

from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense, Flatten, Concatenate, concatenate, Dropout, Lambda
from keras.models import Model
from keras.layers.embeddings import Embedding
from tqdm import tqdm
import shap

# Print SHAP's JavaScript visualization code into the notebook output so that
# the interactive plots further down (e.g. the force plots) can be displayed.
shap.initjs()

## Load dataset

import pandas as pd

# Read the raw census table (path is relative to the working directory).
df = pd.read_csv('../input/adult.csv')

# Notebook-style inspection of the first rows and the per-column dtypes.
df.head()
df.dtypes

# Keep untouched copies of the label and the features before the columns of
# `df` are re-encoded below; X_display is later handed to the SHAP plots so
# they can show the original, human-readable values.
y_display = df['income']
X_display = df.drop(columns='income')

# Numeric features: every int64 column is stored as float32 from here on.
int_columns = df.select_dtypes(include=['int64']).columns
df[int_columns] = df[int_columns].astype('float32')

# Categorical features: every object (string) column, including the 'income'
# label, is replaced by the integer codes of its own pandas categorical.
cat_columns = df.select_dtypes(include=['object']).columns
df[cat_columns] = df[cat_columns].apply(
    lambda column: column.astype('category').cat.codes
)

# Split the encoded table into the label and the model features.
y = df['income']
X = df.drop(columns='income')

#X,y = shap.datasets.adult()
#X_display,y_display = shap.datasets.adult(display=True)

# normalize data (this is important for model convergence)
# `dtypes` is a list of (column name, dtype name) pairs in column order; it is
# reused further down to build and feed the model.
dtypes = [(name, str(kind)) for name, kind in X.dtypes.items()]
for name, kind in dtypes:
    if kind != "float32":
        # only the numeric (float32) columns are standardized
        continue
    # center first, then scale by the standard deviation of the centered column
    centered = X[name] - X[name].mean()
    X[name] = centered / centered.std()

# Hold out 20% of the rows for validation; the seed is fixed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

X.head()

## Train Keras model

# build model
# One scalar Input per feature column, in the order given by `dtypes`.
# Integer-coded categorical columns (dtype "int8") pass through a learned
# 1-dimensional Embedding; every other column is fed into the network as-is.
input_els = []
encoded_els = []
for k,dtype in dtypes:
    input_els.append(Input(shape=(1,)))
    if dtype == "int8":
        # Size the embedding table from the full dataset rather than only the
        # training split, so a category code that occurs solely in validation
        # rows (or in rows explained later) is still a valid index.
        n_codes = int(X[k].max()) + 1
        e = Flatten()(Embedding(n_codes, 1)(input_els[-1]))
    else:
        e = input_els[-1]
    encoded_els.append(e)
encoded_els = concatenate(encoded_els)
layer1 = Dropout(0.5)(Dense(100, activation="relu")(encoded_els))
# The model is compiled with 'binary_crossentropy', which by default treats the
# model output as a probability; squash the output with a sigmoid so the loss
# receives values in [0, 1] instead of unbounded linear activations.
out = Dense(1, activation="sigmoid")(layer1)

# train model
# Keras is given one 1-D array per Input layer, in the same column order
# (`dtypes`) that was used to create the inputs.
feature_names = [name for name, _ in dtypes]
train_inputs = [X_train[name].values for name in feature_names]
valid_inputs = [X_valid[name].values for name in feature_names]

regression = Model(inputs=input_els, outputs=[out])
regression.compile(loss='binary_crossentropy', optimizer="adam")
regression.fit(
    train_inputs,
    y_train,
    batch_size=512,
    epochs=50,
    shuffle=True,
    validation_data=(valid_inputs, y_valid),
)

## Explain predictions

# Here we take the Keras model trained above and explain why it makes different
# predictions for different individuals. SHAP expects model functions to take a
# 2D numpy array as input, so we define a wrapper function around the original
# Keras predict function.

def f(X):
    """Adapt ``regression.predict`` to the single-2D-array interface SHAP uses.

    X: 2D numpy array with one column per model input, in the same column
       order the model inputs were created in.
    Returns a 1D array holding one model output per row of X.
    """
    # the Keras model takes a list with one 1-D array per input layer
    return regression.predict([X[:,i] for i in range(X.shape[1])]).flatten()

### Explain a single prediction

# Here we use the first 100 samples of the dataset to represent "typical"
# feature values, and then use 500 perturbation samples to estimate the SHAP
# values for a given prediction. Note that this requires 500 * 100 evaluations
# of the model.
# NOTE(review): the two-argument force_plot(shap_values, features) form matches
# older shap releases; newer ones appear to take the base value as the first
# argument -- confirm against the installed shap version.

explainer = shap.KernelExplainer(f, X.iloc[:100,:])
shap_values = explainer.shap_values(X.iloc[350,:], nsamples=500)
shap.force_plot(shap_values, X_display.iloc[350,:])

shap_values = explainer.shap_values(X.iloc[167,:], nsamples=500)
shap.force_plot(shap_values, X_display.iloc[167,:])

### Explain many predictions

# Here we repeat the above explanation process for the 230 individuals in
# rows 100 through 329.

shap_values = explainer.shap_values(X.iloc[100:330,:], nsamples=500)

# Force plot for rows 100-329, labelled with the original (un-encoded) feature
# values from X_display.
shap.force_plot(shap_values, X_display.iloc[100:330,:])

# Summary plot over the same rows. The SHAP values computed for these rows are
# stored in `shap_values`; the previous name `shap_values50` was never defined
# and raised a NameError.
shap.summary_plot(shap_values, X.iloc[100:330,:])

# Dependence plot for the "marital.status" feature: encoded values come from X,
# the human-readable values shown on the plot from X_display.
shap.dependence_plot("marital.status", 
                     shap_values, 
                     X.iloc[100:330,:], 
                     display_features=X_display.iloc[100:330,:])