{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "bukochgPFg7s" }, "source": [ "# Getting Started with tabular data!\n", "[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/understandable-machine-intelligence-lab/Quantus/main?labpath=tutorials%2FTutorial_Getting_Started_with_Tabular_Data.ipynb)\n", "\n", "\n", "This notebook shows how to get started with Quantus using tabular data. For this purpose, we use the classic Titanic tabular dataset (Frank E. Harrell Jr., Thomas Cason):\n", "\n", "https://www.openml.org/d/40945\n", "\n", "The model in this notebook is taken from \"Getting started with Captum - Titanic Data Analysis\" provided by Captum:\n", "\n", "https://captum.ai/tutorials/Titanic_Basic_Interpret" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "from IPython.display import clear_output" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "4Y7_mNf9Bic0" }, "outputs": [], "source": [ "!pip install quantus torch captum tensorflow-datasets pandas\n", "\n", "clear_output()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "RV7X-Ss9-16F" }, "outputs": [], "source": [ "import pathlib\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "import quantus\n", "from captum.attr import IntegratedGradients\n", "\n", "import torch\n", "import torch.nn as nn\n", "\n", "torch.manual_seed(27)\n", "\n", "clear_output()\n", "\n", "np.random.seed(27)" ] }, { "cell_type": "markdown", "metadata": { "id": "mGhP4bTuoWYF" }, "source": [ "## 1) Preliminaries" ] }, { "cell_type": "markdown", "metadata": { "id": "XqKzag4VFjHT" }, "source": [ "### 1.1 Load datasets\n", "\n", "We load the dataset using the tensorflow-datasets library. Alternatively, it can be downloaded directly from the OpenML website: https://www.openml.org/d/40945" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "TmsZxFhuc0mm" }, "outputs": [], "source": [ "# Load datasets\n", "df = pd.read_csv(\"assets/titanic3.csv\")\n", "df = df[[\"age\", \"embarked\", \"fare\", \"parch\", \"pclass\", \"sex\", \"sibsp\", \"survived\"]]\n", "df[\"age\"] = df[\"age\"].fillna(df[\"age\"].mean())\n", "df[\"fare\"] = df[\"fare\"].fillna(df[\"fare\"].mean())" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": " age fare parch pclass sibsp \\\ncount 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000 \nmean 29.881138 33.295479 0.385027 2.294882 0.498854 \nstd 12.883193 51.738879 0.865560 0.837836 1.041658 \nmin 0.170000 0.000000 0.000000 1.000000 0.000000 \n25% 22.000000 7.895800 0.000000 2.000000 0.000000 \n50% 29.881138 14.454200 0.000000 3.000000 0.000000 \n75% 35.000000 31.275000 0.000000 3.000000 1.000000 \nmax 80.000000 512.329200 9.000000 3.000000 8.000000 \n\n survived \ncount 1309.000000 \nmean 0.381971 \nstd 0.486055 \nmin 0.000000 \n25% 0.000000 \n50% 0.000000 \n75% 1.000000 \nmax 1.000000 ", "text/html": "
" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Data statistics\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# One-hot encode categorical variables\n", "df_enc = pd.get_dummies(df, columns=[\"embarked\", \"pclass\", \"sex\"]).sample(frac=1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Pandas dataframes to numpy arrays\n", "X = df_enc.drop([\"survived\"], axis=1).values\n", "Y = df_enc[\"survived\"].values" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Create train and test set\n", "train_features, test_features, train_labels, test_labels = train_test_split(\n", " X, Y, test_size=0.3\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "vmccxpA0n6MY" }, "source": [ "### 1.2 Train a model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The model is based on \"Getting started with Captum - Titanic Data Analysis\" provided by Captum:\n", "\n", "https://captum.ai/tutorials/Titanic_Basic_Interpret" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "class TitanicSimpleNNModel(nn.Module):\n", " def __init__(self):\n", " super().__init__()\n", " self.linear1 = nn.Linear(12, 12)\n", " self.sigmoid1 = nn.Sigmoid()\n", " self.linear2 = nn.Linear(12, 8)\n", " self.sigmoid2 = nn.Sigmoid()\n", " self.linear3 = nn.Linear(8, 2)\n", " self.softmax = nn.Softmax(dim=1)\n", "\n", " def forward(self, x):\n", " lin1_out = self.linear1(x)\n", " sigmoid_out1 = self.sigmoid1(lin1_out)\n", " sigmoid_out2 = self.sigmoid2(self.linear2(sigmoid_out1))\n", " return self.softmax(self.linear3(sigmoid_out2))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/200 => Loss: 0.72\n", "Epoch 21/200 => Loss: 0.57\n", "Epoch 41/200 => Loss: 0.50\n", "Epoch 61/200 => Loss: 0.48\n", "Epoch 81/200 => Loss: 0.48\n", "Epoch 101/200 => Loss: 0.47\n", "Epoch 121/200 => Loss: 0.47\n", "Epoch 141/200 => Loss: 0.49\n", "Epoch 161/200 => Loss: 0.47\n", "Epoch 181/200 => Loss: 0.47\n" ] } ], "source": [ "net = TitanicSimpleNNModel()\n", "\n", "criterion = nn.CrossEntropyLoss()\n", "num_epochs = 200\n", "\n", "optimizer = torch.optim.Adam(net.parameters(), lr=0.1)\n", "input_tensor = torch.from_numpy(train_features).type(torch.FloatTensor)\n", "label_tensor = torch.from_numpy(train_labels)\n", "for epoch in range(num_epochs):\n", " output = net(input_tensor)\n", " loss = criterion(output, label_tensor)\n", " optimizer.zero_grad()\n", " loss.backward()\n", " optimizer.step()\n", " if epoch % 20 == 0:\n", " print(\"Epoch {}/{} => Loss: {:.2f}\".format(epoch + 1, num_epochs, loss.item()))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train Accuracy: 0.8384279475982532\n" ] } ], "source": [ "out_probs = net(input_tensor).detach().numpy()\n", "out_classes = np.argmax(out_probs, axis=1)\n", "print(\"Train Accuracy:\", sum(out_classes == train_labels) / len(train_labels))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test Accuracy: 0.7684478371501272\n" ] } ], "source": [ "test_input_tensor = torch.from_numpy(test_features).type(torch.FloatTensor)\n", "out_probs = 
net(test_input_tensor).detach().numpy()\n", "out_classes = np.argmax(out_probs, axis=1)\n", "print(\"Test Accuracy:\", sum(out_classes == test_labels) / len(test_labels))" ] }, { "cell_type": "markdown", "metadata": { "id": "4vY9mZQanaxr" }, "source": [ "### 1.3 Generate explanations\n", "\n", "In this example, we rely on the `captum` library and use the Integrated Gradients method." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "ig = IntegratedGradients(net)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "test_input_tensor.requires_grad_()\n", "attr, delta = ig.attribute(test_input_tensor, target=1, return_convergence_delta=True)\n", "attr = attr.detach().numpy()" ] }, { "cell_type": "markdown", "metadata": { "id": "tuBkEBv3mihR" }, "source": [ "## 2) Quantitative evaluation using Quantus\n", "\n", "We can evaluate our explanations against a variety of quantitative criteria, but as a motivating example we compute the ModelParameterRandomisation score (Adebayo et al., 2018) and the Complexity score (Bhatt et al., 2020).\n", "\n", "The ModelParameterRandomisation metric measures the distance between the original attributions and attributions that are recomputed while the model parameters are randomised one layer at a time, either cascadingly or independently.\n", "\n", "The Complexity of an attribution is defined as the entropy of the fractional contribution of each feature x_i to the total magnitude of the attribution." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "iq7qqDfSmIdj" }, "outputs": [ { "data": { "text/plain": " 0%| | 0/3 [00:00