{
  "cells": [
    {
      "metadata": {
        "_uuid": "51b9b35792a863134d05653832ddcfc9b9fc5d55"
      },
      "cell_type": "markdown",
      "source": "# Census income classification with Keras\n\nTo download a copy of this notebook visit [github](https://github.com/slundberg/shap/tree/master/notebooks)."
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "6348460e1429f5690a9a674bb424d0e3222c2180",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "from sklearn.model_selection import train_test_split\nfrom keras.layers import Input, Dense, Flatten, Concatenate, concatenate, Dropout, Lambda\nfrom keras.models import Model\nfrom keras.layers.embeddings import Embedding\nfrom tqdm import tqdm\nimport shap\n\n# print the JS visualization code to the notebook\nshap.initjs()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "b0341c8c8a042cfdb8025a6cf3085680a5f29bb8"
      },
      "cell_type": "markdown",
      "source": "## Load dataset"
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "a2dfb6995f1861cb752180e0c0510f2a850f0077"
      },
      "cell_type": "code",
      "source": "import pandas as pd",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "ea7795557158ad2e532c37ea1be9ac63c6a6922d"
      },
      "cell_type": "code",
      "source": "df = pd.read_csv('../input/adult.csv')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "2e19963a553fbd4999320513c7692300cb82f458",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "0bbc6f358f02f1532e5f51948c7a17acc5a867a3",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.dtypes",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "b0337930f39dcc955467584298d284c9d372f627"
      },
      "cell_type": "code",
      "source": "X_display = df.drop('income',axis=1)\ny_display = df['income']",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "63d119fdcc100b22fef39e98ca4493b14c5c52b8"
      },
      "cell_type": "code",
      "source": "# Cast integer features to float32; the normalization cell below standardizes every float32 column.\nint_columns = df.select_dtypes(include=['int64']).columns\ndf[int_columns] = df[int_columns].astype('float32')\n\n# Replace each string (object) feature with its integer category code.\ncat_columns = df.select_dtypes(include=['object']).columns\ndf[cat_columns] = df[cat_columns].apply(lambda col: col.astype('category').cat.codes)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "1daf6acd6e8e7e9d3eb751472475934a3f99ee90",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "X = df.drop('income',axis=1)\ny = df['income']",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "1c51d6e50a753206296a2660ffc14f4f4817c4fd",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# Record (column name, dtype string) pairs once; the model-building cell reuses\n# this list to decide which inputs need an embedding.\ndtypes = [(name, str(kind)) for name, kind in X.dtypes.items()]\n\n# normalize data (this is important for model convergence)\nfor name, kind in dtypes:\n    if kind != \"float32\":\n        continue\n    X[name] -= X[name].mean()\n    X[name] /= X[name].std()\n\n# hold out 20% of the rows for validation, with a fixed seed for the split\nX_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=7)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "33985f54eb3c6b51b856319b23e994bdfe65b9e0",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "X.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "306fd19253fc1d632983032997cbc1912b56f1ec"
      },
      "cell_type": "markdown",
      "source": "## Train Keras model"
    },
    {
      "metadata": {
        "scrolled": true,
        "trusted": true,
        "_uuid": "87f3b8d1dec4511815b14254e653128cdc251d2f",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# build model: every feature gets its own scalar input; int8-coded categorical\n# features go through a learned 1-d embedding, all other features are used as-is\ninput_els = []\nencoded_els = []\nfor k,dtype in dtypes:\n    input_els.append(Input(shape=(1,)))\n    if dtype == \"int8\":\n        # size the embedding from the full dataset so that category codes which\n        # only occur in the validation split are still valid indices\n        n_categories = int(X[k].max()) + 1\n        e = Flatten()(Embedding(n_categories, 1)(input_els[-1]))\n    else:\n        e = input_els[-1]\n    encoded_els.append(e)\nencoded_els = concatenate(encoded_els)\nlayer1 = Dropout(0.5)(Dense(100, activation=\"relu\")(encoded_els))\n# binary_crossentropy expects probabilities, so the output unit needs a sigmoid\nout = Dense(1, activation=\"sigmoid\")(layer1)\n\n# train model\nregression = Model(inputs=input_els, outputs=[out])\nregression.compile(optimizer=\"adam\", loss='binary_crossentropy')\nregression.fit(\n    [X_train[k].values for k,t in dtypes],\n    y_train,\n    epochs=50,\n    batch_size=512,\n    shuffle=True,\n    validation_data=([X_valid[k].values for k,t in dtypes], y_valid)\n)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "0b40c91ff32d2827687563dc4acf4dbac96632ec"
      },
      "cell_type": "markdown",
      "source": "## Explain predictions\n\nHere we take the Keras model trained above and explain why it makes different predictions for different individuals. SHAP expects model functions to take a 2D numpy array as input, so we define a wrapper function around the original Keras predict function."
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "e34ed91bdcff6a302d6bd63e25865562b5312c3b"
      },
      "cell_type": "code",
      "source": "def f(X):\n    \"\"\"Adapt the Keras model to the single 2D-array interface SHAP expects.\n\n    Splits the columns of the 2D array `X` into a list of 1D arrays (one per\n    model input), runs `regression.predict` on that list and returns the\n    predictions flattened to a 1D array.\n    \"\"\"\n    per_feature_inputs = [X[:, j] for j in range(X.shape[1])]\n    predictions = regression.predict(per_feature_inputs)\n    return predictions.flatten()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "294a0b684b28ac377ea5149261e304a66653c91b"
      },
      "cell_type": "markdown",
      "source": "### Explain a single prediction\n\nHere we use a selection of 100 samples from the dataset to represent \"typical\" feature values, and then use 500 perturbation samples to estimate the SHAP values for a given prediction. Note that this requires 500 * 100 evaluations of the model."
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "9a064f9e15039c5701e70c91a6d660338d1dd037",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "explainer = shap.KernelExplainer(f, X.iloc[:100,:])\nshap_values = explainer.shap_values(X.iloc[350,:], nsamples=500)\nshap.force_plot(shap_values, X_display.iloc[350,:])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "1fda8fb62140b0cf9364db4effbe65553c2450ea",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "shap_values = explainer.shap_values(X.iloc[167,:], nsamples=500)\nshap.force_plot(shap_values, X_display.iloc[167,:])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "655321b925d8c4f728552145a026d4b380c3b88a"
      },
      "cell_type": "markdown",
      "source": "### Explain many predictions\n\nHere we repeat the above explanation process for 230 individuals. Since we are using a sampling-based approximation, each explanation can take a couple of seconds depending on your machine setup."
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "b7a22507b6192e8b57e414711fdd66bdcf07fd0c",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "shap_values = explainer.shap_values(X.iloc[100:330,:], nsamples=500)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "a1bd82f19f873a46faf276d4cfaf2324ae8d275b",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "shap.force_plot(shap_values, X_display.iloc[100:330,:])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "7835b6813b830531c6ef075d5a8fabe4650bf4ba",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "# summarize the SHAP values computed above for rows 100-329\nshap.summary_plot(shap_values, X.iloc[100:330,:])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "_uuid": "9e8c0b215a7420d5cc19a66be46657f1664a9250",
        "collapsed": true
      },
      "cell_type": "code",
      "source": "shap.dependence_plot(\"marital.status\", \n                     shap_values, \n                     X.iloc[100:330,:], \n                     display_features=X_display.iloc[100:330,:])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "collapsed": true,
        "_uuid": "0c69511164b8753e8cb5e34c7dcbd9d23c246e4d"
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "anaconda-cloud": {},
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.5",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}