{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "ml_sklearn_breastcancer_mlxtend_632bootstrap_nov2021.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "DnKvJqu5hXCH" }, "source": [ "This notebook is an example of bootstrap-based model evaluation: it estimates the accuracy of a logistic regression classifier on the scikit-learn breast cancer dataset with the .632 bootstrap (mlxtend's `bootstrap_point632_score`) and reports a 95% percentile interval over the bootstrap scores.\n", "\n", "Based on the mlxtend user guide:\n", "\n", "http://rasbt.github.io/mlxtend/user_guide/evaluate/bootstrap_point632_score/#bootstrap_point632_score\n", "\n", "\n", "References:\n", "\n", "- [1] Efron, Bradley. 1983. \"Estimating the Error Rate\n", "of a Prediction Rule: Improvement on Cross-Validation.\"\n", "Journal of the American Statistical Association\n", "78 (382): 316. doi:10.2307/2288636.\n", "- [2] Efron, Bradley, and Robert Tibshirani. 1997.\n", "\"Improvements on Cross-Validation: The .632+ Bootstrap Method.\"\n", "Journal of the American Statistical Association\n", "92 (438): 548. doi:10.2307/2965703."
] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WoCsVVokfONq", "outputId": "b0ef1c15-bcdb-438a-eb33-53f5d026ccbd" }, "source": [ "%pip install git+https://github.com/pattersonconsulting/ml_tools.git" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting git+https://github.com/pattersonconsulting/ml_tools.git\n", " Cloning https://github.com/pattersonconsulting/ml_tools.git to /tmp/pip-req-build-t751ukqr\n", " Running command git clone -q https://github.com/pattersonconsulting/ml_tools.git /tmp/pip-req-build-t751ukqr\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (1.1.5)\n", "Requirement already satisfied: sklearn in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (0.0)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (3.2.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (1.19.5)\n", "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (2.8.2)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (0.10.0)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (1.3.2)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (2.4.7)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from cycler>=0.10->matplotlib->ml-valuation==0.0.1) (1.15.0)\n", "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->ml-valuation==0.0.1) (2018.9)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sklearn->ml-valuation==0.0.1) (0.22.2.post1)\n", "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->ml-valuation==0.0.1) (1.4.1)\n", "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->ml-valuation==0.0.1) (1.0.1)\n" ] } ] },
{ "cell_type": "code", "metadata": { "id": "3_BsjXbNH3Hk" }, "source": [ "from mlxtend.evaluate import bootstrap_point632_score\n", "from mlxtend.evaluate import BootstrapOutOfBag\n" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "K6qrORWRWL9i" }, "source": [ "# Third-party imports for the whole notebook, one entry per name (duplicates merged).\n", "import numpy as np\n", "import pandas as pd\n", "from matplotlib import pyplot\n", "\n", "from sklearn.datasets import load_breast_cancer, make_classification\n", "from sklearn.dummy import DummyClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import (\n", "    accuracy_score,\n", "    auc,\n", "    average_precision_score,\n", "    classification_report,\n", "    confusion_matrix,\n", "    precision_recall_curve,\n", "    roc_curve,\n", ")\n", "from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split\n", "from sklearn.utils import resample\n", "\n", "# plot_precision_recall_curve was deprecated in scikit-learn 1.0 and removed in 1.2.\n", "# Guard the import so this cell still runs on current releases; the replacement is\n", "# sklearn.metrics.PrecisionRecallDisplay.from_estimator.\n", "try:\n", "    from sklearn.metrics import plot_precision_recall_curve\n", "except ImportError:\n", "    plot_precision_recall_curve = None\n", "\n", "# Project-local helpers installed by the pip cell above.\n", "import ml_valuation\n", "from ml_valuation import model_valuation, model_visualization\n" ],
"execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "XfLSLa0v-6nG" }, "source": [ "arr_X, arr_y = load_breast_cancer(return_X_y=True)\n" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "S_n1_bDHAqn2", "outputId": "57122e5a-4b04-4982-8fe2-1fe3bc82e924" }, "source": [ "print(\"X: \" + str(arr_X.shape))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "X: (569, 30)\n" ] } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LqYSQbL2BkBE", "outputId": "7809f161-d052-44aa-d428-820293811187" }, "source": [ "print(arr_X)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]\n", " [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]\n", " [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]\n", " ...\n", " [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]\n", " [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]\n", " [7.760e+00 2.454e+01 4.792e+01 ... 
0.000e+00 2.871e-01 7.039e-02]]\n" ] } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LI_guKSLHNML", "outputId": "0a5e182e-4b9e-4818-a3c7-a95ae1b38206" }, "source": [ "unique, counts = np.unique(arr_y, return_counts=True)\n", "dict(zip(unique, counts))" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{0: 212, 1: 357}" ] }, "metadata": {}, "execution_count": 7 } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OPFlKwQYIoeR", "outputId": "1381aca9-ce68-458b-ebed-aa78a5f447db" }, "source": [ "X = arr_X\n", "y = arr_y\n", "\n", "print(X.shape)\n", "print(y.shape)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(569, 30)\n", "(569,)\n" ] } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RUhMb8pmkW5w", "outputId": "511fe9e5-2703-4766-f1f4-8f5d154cf1d6" }, "source": [ "# Estimate out-of-sample accuracy of logistic regression with the .632 bootstrap.\n", "SEED = 42        # fixes the bootstrap resampling so the reported numbers are reproducible\n", "N_SPLITS = 200   # number of bootstrap rounds (mlxtend's default, made explicit)\n", "\n", "# With the default max_iter=100, newton-cg raised ConvergenceWarning on some\n", "# bootstrap resamples of the unscaled features, so give the solver more iterations.\n", "model = LogisticRegression(solver='newton-cg', max_iter=1000)\n", "\n", "# One score per bootstrap round; the estimator is cloned and refit on every resample.\n", "scores = bootstrap_point632_score(\n", "    model, X, y, n_splits=N_SPLITS, method='.632', random_seed=SEED\n", ")\n", "acc = np.mean(scores)\n", "print('Accuracy: %.2f%%' % (100*acc))\n", "\n", "# 95% percentile interval over the per-round bootstrap scores\n", "lower = np.percentile(scores, 2.5)\n", "upper = np.percentile(scores, 97.5)\n", "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))\n" ], "execution_count": null, "outputs": [] }
] }