{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "ml_sklearn_breastcancer_mlxtend_632bootstrap_nov2021.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DnKvJqu5hXCH"
      },
      "source": [
        "This notebook is an example of using bootstrap and bootstrap632\n",
        "\n",
        "Based on\n",
        "\n",
        "http://rasbt.github.io/mlxtend/user_guide/evaluate/bootstrap_point632_score/#bootstrap_point632_score\n",
        "\n",
        "\n",
        "References:\n",
        "\n",
        "- [1] Efron, Bradley. 1983. \"Estimating the Error Rate\n",
        "of a Prediction Rule: Improvement on Cross-Validation.\"\n",
        "Journal of the American Statistical Association\n",
        "78 (382): 316. doi:10.2307/2288636.\n",
        "- [2] Efron, Bradley, and Robert Tibshirani. 1997.\n",
        "\"Improvements on Cross-Validation: The .632+ Bootstrap Method.\"\n",
        "Journal of the American Statistical Association\n",
        "92 (438): 548. doi:10.2307/2965703."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WoCsVVokfONq",
        "outputId": "b0ef1c15-bcdb-438a-eb33-53f5d026ccbd"
      },
      "source": [
        "!pip install git+https://github.com/pattersonconsulting/ml_tools.git"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting git+https://github.com/pattersonconsulting/ml_tools.git\n",
            "  Cloning https://github.com/pattersonconsulting/ml_tools.git to /tmp/pip-req-build-t751ukqr\n",
            "  Running command git clone -q https://github.com/pattersonconsulting/ml_tools.git /tmp/pip-req-build-t751ukqr\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (1.1.5)\n",
            "Requirement already satisfied: sklearn in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (0.0)\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (3.2.2)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (1.19.5)\n",
            "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (2.8.2)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (0.10.0)\n",
            "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (1.3.2)\n",
            "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (2.4.7)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from cycler>=0.10->matplotlib->ml-valuation==0.0.1) (1.15.0)\n",
            "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->ml-valuation==0.0.1) (2018.9)\n",
            "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sklearn->ml-valuation==0.0.1) (0.22.2.post1)\n",
            "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->ml-valuation==0.0.1) (1.4.1)\n",
            "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->ml-valuation==0.0.1) (1.0.1)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3_BsjXbNH3Hk"
      },
      "source": [
        "from mlxtend.evaluate import bootstrap_point632_score\n",
        "from mlxtend.evaluate import BootstrapOutOfBag\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "K6qrORWRWL9i"
      },
      "source": [
        "import pandas as pd\n",
        "\n",
        "import numpy as np\n",
        "from sklearn.metrics import confusion_matrix\n",
        "from sklearn.metrics import roc_curve, auc\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.datasets import make_classification\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "\n",
        "#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
        "from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split\n",
        "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
        "from sklearn.utils import resample\n",
        "from sklearn.dummy import DummyClassifier\n",
        "\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.datasets import load_breast_cancer\n",
        "\n",
        "from sklearn.metrics import average_precision_score\n",
        "from sklearn.metrics import precision_recall_curve\n",
        "from sklearn.metrics import plot_precision_recall_curve\n",
        "from sklearn.metrics import roc_curve, auc, confusion_matrix\n",
        "\n",
        "from matplotlib import pyplot\n",
        "\n",
        "import ml_valuation\n",
        "\n",
        "from ml_valuation import model_valuation\n",
        "from ml_valuation import model_visualization\n",
        "\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XfLSLa0v-6nG"
      },
      "source": [
        "arr_X, arr_y = load_breast_cancer(return_X_y=True)\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S_n1_bDHAqn2",
        "outputId": "57122e5a-4b04-4982-8fe2-1fe3bc82e924"
      },
      "source": [
        "#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
        "\n",
        "print(\"X: \" + str(arr_X.shape))\n",
        "#print(\"X_test: \" + str(X.shape))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "X: (569, 30)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LqYSQbL2BkBE",
        "outputId": "7809f161-d052-44aa-d428-820293811187"
      },
      "source": [
        "print(arr_X)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]\n",
            " [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]\n",
            " [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]\n",
            " ...\n",
            " [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]\n",
            " [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]\n",
            " [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LI_guKSLHNML",
        "outputId": "0a5e182e-4b9e-4818-a3c7-a95ae1b38206"
      },
      "source": [
        "unique, counts = np.unique(arr_y, return_counts=True)\n",
        "dict(zip(unique, counts))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{0: 212, 1: 357}"
            ]
          },
          "metadata": {},
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "OPFlKwQYIoeR",
        "outputId": "1381aca9-ce68-458b-ebed-aa78a5f447db"
      },
      "source": [
        "X = arr_X\n",
        "y = arr_y\n",
        "\n",
        "#data = pd.concat(objs=[X, y], axis=1)\n",
        "\n",
        "print(X.shape)\n",
        "print(y.shape)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "(569, 30)\n",
            "(569,)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RUhMb8pmkW5w",
        "outputId": "511fe9e5-2703-4766-f1f4-8f5d154cf1d6"
      },
      "source": [
        "\n",
        "# fit a model\n",
        "model = LogisticRegression(solver='newton-cg')\n",
        "\n",
        "#cv = StratifiedKFold( n_splits=10 )\n",
        "#stats = list()\n",
        "\n",
        "\n",
        "#num_splits = 500\n",
        "#cnt = 0\n",
        "\n",
        "scores = bootstrap_point632_score(model, X, y)\n",
        "acc = np.mean(scores)\n",
        "print('Accuracy: %.2f%%' % (100*acc))\n",
        "\n",
        "\n",
        "# Confidence interval\n",
        "lower = np.percentile(scores, 2.5)\n",
        "upper = np.percentile(scores, 97.5)\n",
        "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:212: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.\n",
            "  \"number of iterations.\", ConvergenceWarning)\n",
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:212: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.\n",
            "  \"number of iterations.\", ConvergenceWarning)\n",
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:212: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.\n",
            "  \"number of iterations.\", ConvergenceWarning)\n",
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:212: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.\n",
            "  \"number of iterations.\", ConvergenceWarning)\n",
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:212: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.\n",
            "  \"number of iterations.\", ConvergenceWarning)\n",
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:212: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.\n",
            "  \"number of iterations.\", ConvergenceWarning)\n",
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:212: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.\n",
            "  \"number of iterations.\", ConvergenceWarning)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Accuracy: 95.67%\n",
            "95% Confidence interval: [94.14, 97.09]\n"
          ]
        }
      ]
    }
  ]
}