{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data\n",
    "\n",
    "Read the UCI Adult data file from its archive URL, name its 15 columns, and hold out 20% of the rows as a test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install interpret (plus numpy, pandas, scikit-learn) only when interpret is not importable.\n",
    "# %pip, unlike !pip, installs into the environment of the running kernel.\n",
    "try:\n",
    "    import interpret\n",
    "except ModuleNotFoundError:\n",
    "    %pip install --quiet interpret numpy pandas scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Fixed seed for the train/test split, so every run evaluates both models\n",
    "# on the same held-out rows.\n",
    "SEED = 42\n",
    "\n",
    "# The file has no header row, so column names are assigned explicitly below.\n",
    "df = pd.read_csv(\n",
    "    \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
    "    header=None)\n",
    "df.columns = [\n",
    "    \"Age\", \"WorkClass\", \"fnlwgt\", \"Education\", \"EducationNum\",\n",
    "    \"MaritalStatus\", \"Occupation\", \"Relationship\", \"Race\", \"Gender\",\n",
    "    \"CapitalGain\", \"CapitalLoss\", \"HoursPerWeek\", \"NativeCountry\", \"Income\"\n",
    "]\n",
    "\n",
    "# Features are every column except the last; the last column (Income) is the label.\n",
    "X = df.iloc[:, :-1]\n",
    "y = df.iloc[:, -1]\n",
    "\n",
    "# Set feature_types manually instead of relying on auto-detect\n",
    "# (one entry per column of X, in column order).\n",
    "feature_types = [\n",
    "    'continuous', 'nominal', 'continuous', 'nominal',\n",
    "    'continuous', 'nominal', 'nominal', 'nominal', 'nominal', 'nominal',\n",
    "    'continuous', 'continuous', 'continuous', 'nominal'\n",
    "]\n",
    "\n",
    "# A pair of bounds for each continuous column, passed to the DP model as privacy_bounds.\n",
    "privacy_bounds = {\n",
    "    \"Age\": (17, 90),\n",
    "    \"fnlwgt\": (12285, 1484705),\n",
    "    \"EducationNum\": (1, 16),\n",
    "    \"CapitalGain\": (0, 99999),\n",
    "    \"CapitalLoss\": (0, 4356),\n",
    "    \"HoursPerWeek\": (1, 99),\n",
    "}\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.20, random_state=SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fit and compare DP-EBM vs. standard EBM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from interpret.privacy import DPExplainableBoostingClassifier\n",
    "from interpret.glassbox import ExplainableBoostingClassifier\n",
    "from interpret import show\n",
    "\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# NOTE(review): random_state=None leaves the DP model unseeded, so its AUC can\n",
    "# differ between runs - presumably intentional for the privacy noise; confirm.\n",
    "dpebm = DPExplainableBoostingClassifier(\n",
    "    random_state=None,\n",
    "    epsilon=1,\n",
    "    delta=1e-6,\n",
    "    feature_types=feature_types,\n",
    "    privacy_bounds=privacy_bounds,\n",
    ")\n",
    "dpebm.fit(X_train, y_train)\n",
    "# AUC is computed from the second predict_proba column.\n",
    "dp_auc = roc_auc_score(y_test, dpebm.predict_proba(X_test)[:, 1])\n",
    "print(f\"DPEBM AUC: {dp_auc:.3f}\")\n",
    "\n",
    "# Standard (non-private) EBM with default settings, fit and scored on the same split.\n",
    "ebm = ExplainableBoostingClassifier()\n",
    "ebm.fit(X_train, y_train)\n",
    "ebm_auc = roc_auc_score(y_test, ebm.predict_proba(X_test)[:, 1])\n",
    "print(f\"EBM AUC: {ebm_auc:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## See differences in learned shape functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "show(dpebm.explain_global(name='DP EBM'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "show(ebm.explain_global(name='Standard EBM'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}