{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "ml_sklearn_breastcancer_cross_validation_student_T_CI_nov2021.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jUHPI7fHd-A4"
      },
      "source": [
        "In this notebook we compute 95% Confidence Intervals for 10-Fold Cross Validation as a comparison to the Standard Error Method\n",
        "\n",
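        "The standard error of the mean, s / sqrt(n), estimates the standard deviation of the sampling distribution of the sample mean. For small samples the standardized mean follows a Student's t-distribution, which has heavier tails than the Gaussian and whose shape depends on the sample size (through the degrees of freedom, n - 1).\n"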
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WoCsVVokfONq",
        "outputId": "62f41792-36bc-40f7-fa5d-d549d91c2aa0"
      },
      "source": [
        "# Install the ml_tools helper package directly from its GitHub repository.\n",
        "!pip install git+https://github.com/pattersonconsulting/ml_tools.git"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting git+https://github.com/pattersonconsulting/ml_tools.git\n",
            "  Cloning https://github.com/pattersonconsulting/ml_tools.git to /tmp/pip-req-build-nm_jnpmr\n",
            "  Running command git clone -q https://github.com/pattersonconsulting/ml_tools.git /tmp/pip-req-build-nm_jnpmr\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (1.1.5)\n",
            "Requirement already satisfied: sklearn in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (0.0)\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (3.2.2)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from ml-valuation==0.0.1) (1.19.5)\n",
            "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (1.3.2)\n",
            "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (2.8.2)\n",
            "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (2.4.7)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->ml-valuation==0.0.1) (0.11.0)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->ml-valuation==0.0.1) (1.15.0)\n",
            "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->ml-valuation==0.0.1) (2018.9)\n",
            "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sklearn->ml-valuation==0.0.1) (0.22.2.post1)\n",
            "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->ml-valuation==0.0.1) (1.4.1)\n",
            "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->ml-valuation==0.0.1) (1.1.0)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "K6qrORWRWL9i"
      },
      "source": [
        "import pandas as pd\n",
        "\n",
        "import numpy as np\n",
        "from sklearn.metrics import confusion_matrix\n",
        "from sklearn.metrics import roc_curve, auc\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.datasets import make_classification\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "\n",
        "#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
        "from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split\n",
        "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
        "from sklearn.utils import resample\n",
        "from sklearn.dummy import DummyClassifier\n",
        "\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.datasets import load_breast_cancer\n",
        "\n",
        "from sklearn.metrics import average_precision_score\n",
        "from sklearn.metrics import precision_recall_curve\n",
        "from sklearn.metrics import plot_precision_recall_curve\n",
        "from sklearn.metrics import roc_curve, auc, confusion_matrix\n",
        "\n",
        "from matplotlib import pyplot\n",
        "\n",
        "import ml_valuation\n",
        "\n",
        "from ml_valuation import model_valuation\n",
        "from ml_valuation import model_visualization\n",
        "\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XfLSLa0v-6nG"
      },
      "source": [
        "# Load the breast cancer dataset and keep its feature matrix and label vector.\n",
        "breast_cancer = load_breast_cancer()\n",
        "arr_X, arr_y = breast_cancer.data, breast_cancer.target\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S_n1_bDHAqn2",
        "outputId": "03003f9a-fc0c-4b00-dc5d-e36f4ac6fc06"
      },
      "source": [
        "# Report the dimensions of the feature matrix.\n",
        "print(f\"X: {arr_X.shape}\")"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "X: (569, 30)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LqYSQbL2BkBE",
        "outputId": "7764f2a8-0d20-40f3-fb0a-38316f556aa0"
      },
      "source": [
        "# Preview the raw feature values; NumPy abbreviates the large array with ellipses.\n",
        "print(arr_X)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]\n",
            " [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]\n",
            " [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]\n",
            " ...\n",
            " [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]\n",
            " [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]\n",
            " [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LI_guKSLHNML",
        "outputId": "1fa40161-e536-4007-e9d6-2466bd3c378f"
      },
      "source": [
        "# Count how many samples carry each class label.\n",
        "class_labels, class_counts = np.unique(arr_y, return_counts=True)\n",
        "{label: count for label, count in zip(class_labels, class_counts)}"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{0: 212, 1: 357}"
            ]
          },
          "metadata": {},
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RUhMb8pmkW5w",
        "outputId": "a3b9fb7a-3ab0-47ff-e32a-ea1b2a397a56"
      },
      "source": [
        "from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV, StratifiedKFold\n",
        "\n",
        "# Stratified k-fold cross validation of a logistic regression classifier.\n",
        "# The held-out accuracy of every fold is collected in `stats`.\n",
        "k = 10\n",
        "cv = StratifiedKFold(n_splits=k)\n",
        "classifier_kfold_LR = LogisticRegression(solver='newton-cg')\n",
        "\n",
        "# Wrap the arrays in DataFrames so fold members can be picked positionally with .iloc\n",
        "X = pd.DataFrame(arr_X)\n",
        "y = pd.DataFrame(arr_y)\n",
        "\n",
        "stats = []\n",
        "for fold_number, (train_rows, test_rows) in enumerate(cv.split(X, y)):\n",
        "  print(f\"Running CV Fold-{fold_number}\")\n",
        "\n",
        "  # Refit on this fold's training rows, with the labels flattened to 1-D\n",
        "  classifier_kfold_LR.fit(X.iloc[train_rows], y.iloc[train_rows].values.ravel())\n",
        "\n",
        "  # Score the hard class predictions against the held-out rows\n",
        "  held_out_predictions = classifier_kfold_LR.predict(X.iloc[test_rows])\n",
        "  fold_accuracy = accuracy_score(y.iloc[test_rows], held_out_predictions)\n",
        "\n",
        "  stats.append(fold_accuracy)\n",
        "  print(\"Accuracy: \" + str(fold_accuracy))\n",
        "  print(\"-----\")\n",
        "\n",
        "mean_score = np.mean(stats)\n",
        "print(f\"\\n\\nAverage Accuracy Across All Folds: {mean_score:.4f}\")\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Running CV Fold-0\n",
            "Accuracy: 0.9824561403508771\n",
            "-----\n",
            "Running CV Fold-1\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/scipy/optimize/linesearch.py:314: LineSearchWarning: The line search algorithm did not converge\n",
            "  warn('The line search algorithm did not converge', LineSearchWarning)\n",
            "/usr/local/lib/python3.7/dist-packages/sklearn/utils/optimize.py:204: UserWarning: Line Search failed\n",
            "  warnings.warn('Line Search failed')\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Accuracy: 0.9122807017543859\n",
            "-----\n",
            "Running CV Fold-2\n",
            "Accuracy: 0.9298245614035088\n",
            "-----\n",
            "Running CV Fold-3\n",
            "Accuracy: 0.9473684210526315\n",
            "-----\n",
            "Running CV Fold-4\n",
            "Accuracy: 0.9824561403508771\n",
            "-----\n",
            "Running CV Fold-5\n",
            "Accuracy: 0.9824561403508771\n",
            "-----\n",
            "Running CV Fold-6\n",
            "Accuracy: 0.9298245614035088\n",
            "-----\n",
            "Running CV Fold-7\n",
            "Accuracy: 0.9473684210526315\n",
            "-----\n",
            "Running CV Fold-8\n",
            "Accuracy: 0.9649122807017544\n",
            "-----\n",
            "Running CV Fold-9\n",
            "Accuracy: 0.9642857142857143\n",
            "-----\n",
            "\n",
            "\n",
            "Average Accuracy Across All Folds: 0.9543\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oBdHdRtTgV_d"
      },
      "source": [
        "# Calculating a 95% confidence interval for a population mean\n",
        "\n",
        "(1) determine if you need to use:     \n",
        "* the normal distribution\n",
        "* the student's t-distribution\n",
        "\n",
        "(2) if the population standard deviation is unknown and we only have the sample standard deviation\n",
        "* this is an indication to use the student's t distribution\n",
        "\n",
        "(3) sample size as another indicator\n",
        "* when n >= 30 then we use the normal distribution\n",
        "* when n < 30 we use the student t distribution\n",
        "\n",
        "(4) computing confidence intervals for the population mean\n",
        "\n",
        "CIs = sample mean +/- t (v, alpha/2) * s / sqrt(n)\n",
        "\n",
        "v = n - 1\n",
        "\n",
        "alpha / 2 = (1 - CL) / 2 = (1 - 0.95) / 2 = 0.05 / 2 = 0.025\n",
        "\n",
        "t = look up in student's t-distribution table with v and alpha/2 values\n",
        "\n",
        "t (9, 0.025) = 2.262\n",
        "\n",
        "EBM: \"margin of error\"\n",
        "\n",
        "ebm = (t) (std / sqrt(n))\n",
        "\n",
        "CI = mean +/- (t) (std / sqrt(n))\n",
        "\n",
        "CI = mean +/- ebm\n",
        "\n",
        "What does this confidence interval mean?\n",
        "\n",
        "This confidence interval means that we are 95% confident that the average accuracy of this model over the full population of breast cancer data lies between the low and high bounds printed by the cell below (roughly 94% to 97%).\n",
        "\n",
        "## What does Confidence Interval Reveal?\n",
        "\n",
        "\"A confidence interval is a range of values, bounded above and below the statistic's mean, that likely would contain an unknown population parameter. Confidence level refers to the *percentage of probability*, or certainty, that the confidence interval would contain the true population parameter when you draw a random sample many times.\"\n",
        "\n",
        "Put differently: if we repeated the whole sampling-and-estimation procedure many times, about 95% of the intervals built this way would contain the true mean accuracy. It does not mean that 95% of individual accuracy measurements fall between the lower and upper bound of the estimate.\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oNC7Xie3fF5x",
        "outputId": "ea7bd86a-a868-453b-aec4-3a9144d152d1"
      },
      "source": [
        "# Aliased import: the name `stats` already refers to the list of per-fold accuracies.\n",
        "from scipy.stats import t as student_t\n",
        "\n",
        "# Confidence interval for the mean fold accuracy: mean +/- t(v, alpha/2) * s / sqrt(n)\n",
        "confidence_level = 0.95\n",
        "alpha = 1 - confidence_level\n",
        "\n",
        "# Two-sided critical value with v = k - 1 degrees of freedom. It is derived from k\n",
        "# so that it stays correct if the number of folds changes (k = 10 gives ~2.262).\n",
        "t = student_t.ppf(1 - alpha / 2, df=k - 1)\n",
        "\n",
        "# ddof=1 yields the sample standard deviation s (divides by n - 1), which is what the\n",
        "# t-interval formula calls for; np.std defaults to the population form (ddof=0).\n",
        "std_dev_sample = np.std(stats, ddof=1)\n",
        "print(\"\\n\\nSample STD DEV: \" + str(std_dev_sample))\n",
        "\n",
        "# Margin of error (EBM): t * s / sqrt(n)\n",
        "ebm = (1/np.sqrt(k)) * std_dev_sample * t\n",
        "\n",
        "print(\"\\n\\nEBM (Accuracy) Across All Folds: ( \" + str(\"{:.4f}\".format(ebm)) + \")\")\n",
        "\n",
        "print(f\"CI Ranges {confidence_level:.0%}:\")\n",
        "\n",
        "low_end_range = mean_score - ebm\n",
        "high_end_range = mean_score + ebm\n",
        "\n",
        "print(\"High: \" + str(high_end_range))\n",
        "print(\"Low : \" + str(low_end_range))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "\n",
            "Sample STD DEV: 0.023770661464399972\n",
            "\n",
            "\n",
            "EBM (Accuracy) Across All Folds: ( 0.0170)\n",
            "CI Ranges 95%:\n",
            "High: 0.9713266337249031\n",
            "Low : 0.9373199828164502\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OtWo8JfJgUjS"
      },
      "source": [
        "These results are comparable to bootstrap632 results, for reference, on the same dataset / classifier combination\n",
        "\n",
        "Notable Links\n",
        "* https://stats.stackexchange.com/questions/549103/is-bootstrap-variance-based-on-mean-same-as-standard-error\n",
        "* https://support.minitab.com/en-us/minitab-express/1/help-and-how-to/basic-statistics/inference/how-to/resampling/bootstrapping-for-1-sample-mean/interpret-the-results/key-results/\n",
        "* https://stats.stackexchange.com/questions/231263/help-understanding-standard-error\n"
      ]
    }
  ]
}