# Census income classification with Keras

To download a copy of this notebook visit [github](https://github.com/slundberg/shap/tree/master/notebooks).

In [None]:
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense, Flatten, Concatenate, concatenate, Dropout, Lambda
from keras.models import Model
from keras.layers.embeddings import Embedding
from tqdm import tqdm
import shap

# print the JS visualization code to the notebook
shap.initjs()

## Load dataset

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('../input/adult.csv')

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
X_display = df.drop('income',axis=1)
y_display = df['income']

In [None]:
int_columns = df.select_dtypes(['int64']).columns
df[int_columns] = df[int_columns].astype('float32')

cat_columns = df.select_dtypes(['object']).columns
df[cat_columns] = df[cat_columns].astype('category')
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

In [None]:
X = df.drop('income',axis=1)
y = df['income']

In [None]:
#X,y = shap.datasets.adult()
#X_display,y_display = shap.datasets.adult(display=True)

# normalize data (this is important for model convergence)
dtypes = list(zip(X.dtypes.index, map(str, X.dtypes)))
for k,dtype in dtypes:
 if dtype == "float32":
 X[k] -= X[k].mean()
 X[k] /= X[k].std()

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=7)

In [None]:
X.head()

## Train Keras model

In [None]:
# build model
input_els = []
encoded_els = []
for k,dtype in dtypes:
 input_els.append(Input(shape=(1,)))
 if dtype == "int8":
 e = Flatten()(Embedding(X_train[k].max()+1, 1)(input_els[-1]))
 else:
 e = input_els[-1]
 encoded_els.append(e)
encoded_els = concatenate(encoded_els)
layer1 = Dropout(0.5)(Dense(100, activation="relu")(encoded_els))
out = Dense(1)(layer1)

# train model
regression = Model(inputs=input_els, outputs=[out])
regression.compile(optimizer="adam", loss='binary_crossentropy')
regression.fit(
 [X_train[k].values for k,t in dtypes],
 y_train,
 epochs=50,
 batch_size=512,
 shuffle=True,
 validation_data=([X_valid[k].values for k,t in dtypes], y_valid)
)

## Explain predictions

Here we take the Keras model trained above and explain why it makes different predictions for different individuals. SHAP expects model functions to take a 2D numpy array as input, so we define a wrapper function around the original Keras predict function.

In [None]:
def f(X):
 return regression.predict([X[:,i] for i in range(X.shape[1])]).flatten()

### Explain a single prediction

Here we use a selection of 50 samples from the dataset to represent "typical" feature values, and then use 500 perterbation samples to estimate the SHAP values for a given prediction. Note that this requires 500 * 50 evaluations of the model.

In [None]:
explainer = shap.KernelExplainer(f, X.iloc[:100,:])
shap_values = explainer.shap_values(X.iloc[350,:], nsamples=500)
shap.force_plot(shap_values, X_display.iloc[350,:])

In [None]:
shap_values = explainer.shap_values(X.iloc[167,:], nsamples=500)
shap.force_plot(shap_values, X_display.iloc[167,:])

### Explain many predictions

Here we repeat the above explanation process for 50 individuals. Since we are using a sampling based approximation each explanation can take a couple seconds depending on your machine setup.

In [None]:
shap_values = explainer.shap_values(X.iloc[100:330,:], nsamples=500)

In [None]:
shap.force_plot(shap_values, X_display.iloc[100:330,:])

In [None]:
shap.summary_plot(shap_values50, X.iloc[100:330,:])

In [None]:
shap.dependence_plot("marital.status", 
 shap_values, 
 X.iloc[100:330,:], 
 display_features=X_display.iloc[100:330,:])