""" ======================================== Comparison of Calibration of Classifiers ======================================== Well calibrated classifiers are probabilistic classifiers for which the output of :term:`predict_proba` can be directly interpreted as a confidence level. For instance, a well calibrated (binary) classifier should classify the samples such that for the samples to which it gave a :term:`predict_proba` value close to 0.8, approximately 80% actually belong to the positive class. In this example we will compare the calibration of four different models: :ref:`Logistic_regression`, :ref:`gaussian_naive_bayes`, :ref:`Random Forest Classifier ` and :ref:`Linear SVM `. """ # %% # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause # # Dataset # ------- # # We will use a synthetic binary classification dataset with 100,000 samples # and 20 features. Of the 20 features, only 2 are informative, 2 are # redundant (random combinations of the informative features) and the # remaining 16 are uninformative (random numbers). # # Of the 100,000 samples, 100 will be used for model fitting and the remaining # for testing. Note that this split is quite unusual: the goal is to obtain # stable calibration curve estimates for models that are potentially prone to # overfitting. In practice, one should rather use cross-validation with more # balanced splits but this would make the code of this example more complicated # to follow. from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split X, y = make_classification( n_samples=100_000, n_features=20, n_informative=2, n_redundant=2, random_state=42 ) train_samples = 100 # Samples used for training the models X_train, X_test, y_train, y_test = train_test_split( X, y, shuffle=False, test_size=100_000 - train_samples, ) # %% # Calibration curves # ------------------ # # Below, we train each of the four models with the small training dataset, then # plot calibration curves (also known as reliability diagrams) using # predicted probabilities of the test dataset. Calibration curves are created # by binning predicted probabilities, then plotting the mean predicted # probability in each bin against the observed frequency ('fraction of # positives'). Below the calibration curve, we plot a histogram showing # the distribution of the predicted probabilities or more specifically, # the number of samples in each predicted probability bin. import numpy as np from sklearn.svm import LinearSVC class NaivelyCalibratedLinearSVC(LinearSVC): """LinearSVC with `predict_proba` method that naively scales `decision_function` output.""" def fit(self, X, y): super().fit(X, y) df = self.decision_function(X) self.df_min_ = df.min() self.df_max_ = df.max() def predict_proba(self, X): """Min-max scale output of `decision_function` to [0,1].""" df = self.decision_function(X) calibrated_df = (df - self.df_min_) / (self.df_max_ - self.df_min_) proba_pos_class = np.clip(calibrated_df, 0, 1) proba_neg_class = 1 - proba_pos_class proba = np.c_[proba_neg_class, proba_pos_class] return proba # %% from sklearn.calibration import CalibrationDisplay from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegressionCV from sklearn.naive_bayes import GaussianNB # Define the classifiers to be compared in the study. # # Note that we use a variant of the logistic regression model that can # automatically tune its regularization parameter. 
# %%
from sklearn.calibration import CalibrationDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB

# Define the classifiers to be compared in the study.
#
# Note that we use a variant of the logistic regression model that can
# automatically tune its regularization parameter.
#
# For a fair comparison, we should run a hyper-parameter search for all the
# classifiers but we don't do it here for the sake of keeping the example code
# concise and fast to execute.

lr = LogisticRegressionCV(
    Cs=np.logspace(-6, 6, 101), cv=10, scoring="neg_log_loss", max_iter=1_000
)
gnb = GaussianNB()
svc = NaivelyCalibratedLinearSVC(C=1.0)
rfc = RandomForestClassifier(random_state=42)

clf_list = [
    (lr, "Logistic Regression"),
    (gnb, "Naive Bayes"),
    (svc, "SVC"),
    (rfc, "Random forest"),
]

# %%
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

fig = plt.figure(figsize=(10, 10))
gs = GridSpec(4, 2)
colors = plt.get_cmap("Dark2")

ax_calibration_curve = fig.add_subplot(gs[:2, :2])
calibration_displays = {}
markers = ["^", "v", "s", "o"]
for i, (clf, name) in enumerate(clf_list):
    clf.fit(X_train, y_train)
    display = CalibrationDisplay.from_estimator(
        clf,
        X_test,
        y_test,
        n_bins=10,
        name=name,
        ax=ax_calibration_curve,
        color=colors(i),
        marker=markers[i],
    )
    calibration_displays[name] = display

ax_calibration_curve.grid()
ax_calibration_curve.set_title("Calibration plots")

# Add histogram
grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
for i, (_, name) in enumerate(clf_list):
    row, col = grid_positions[i]
    ax = fig.add_subplot(gs[row, col])

    ax.hist(
        calibration_displays[name].y_prob,
        range=(0, 1),
        bins=10,
        label=name,
        color=colors(i),
    )
    ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count")

plt.tight_layout()
plt.show()

# %%
#
# Analysis of the results
# -----------------------
#
# :class:`~sklearn.linear_model.LogisticRegressionCV` returns reasonably well
# calibrated predictions despite the small training set size: its reliability
# curve is the closest to the diagonal among the four models.
#
# Logistic regression is trained by minimizing the log-loss which is a strictly
# proper scoring rule: in the limit of infinite training data, strictly proper
# scoring rules are minimized by the model that predicts the true conditional
# probabilities. That (hypothetical) model would therefore be perfectly
# calibrated. However, using a proper scoring rule as training objective is not
# sufficient to guarantee a well-calibrated model by itself: even with a very
# large training set, logistic regression could still be poorly calibrated, if
# it was too strongly regularized or if the choice and preprocessing of input
# features made this model mis-specified (e.g. if the true decision boundary of
# the dataset is a highly non-linear function of the input features).
#
# In this example the training set was intentionally kept very small. In this
# setting, optimizing the log-loss can still lead to poorly calibrated models
# because of overfitting. To mitigate this, the
# :class:`~sklearn.linear_model.LogisticRegressionCV` class was configured to
# tune the `C` regularization parameter to also minimize the log-loss via inner
# cross-validation so as to find the best compromise for this model in the
# small training set setting.
#
# Because of the finite training set size and the lack of guarantee for
# well-specification, we observe that the calibration curve of the logistic
# regression model is close but not perfectly on the diagonal. The shape of the
# calibration curve of this model can be interpreted as slightly
# under-confident: the predicted probabilities are a bit too close to 0.5
# compared to the true fraction of positive samples.
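# %%
# As a quantitative complement to the visual inspection above, we can evaluate
# two proper scoring rules on the test set: the log-loss and the Brier score.
# Both aggregate calibration and discriminative power into a single number, so
# lower values loosely support the reading of the reliability diagram. This is
# a minimal sketch that reuses the classifiers fitted in the plotting loop
# above.

from sklearn.metrics import brier_score_loss, log_loss

for clf, name in clf_list:
    # Clip predicted probabilities away from exact 0 and 1 so that the
    # log-loss stays finite for the naive min-max scaling of the SVC wrapper.
    proba_pos = np.clip(clf.predict_proba(X_test)[:, 1], 1e-6, 1 - 1e-6)
    ll = log_loss(y_test, np.c_[1 - proba_pos, proba_pos])
    bs = brier_score_loss(y_test, proba_pos)
    print(f"{name}: log-loss={ll:.3f}, Brier score={bs:.3f}")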
# %%
# The other methods all output less well calibrated probabilities:
#
# * :class:`~sklearn.naive_bayes.GaussianNB` tends to push probabilities to 0
#   or 1 (see histogram) on this particular dataset (over-confidence). This is
#   mainly because the naive Bayes equation only provides correct estimates of
#   probabilities when the assumption that features are conditionally
#   independent holds [2]_. However, features can be correlated and this is the
#   case with this dataset, which contains 2 features generated as random
#   linear combinations of the informative features. These correlated features
#   are effectively being 'counted twice', pushing the predicted probabilities
#   towards 0 and 1 [3]_. Note, however, that changing the seed used to
#   generate the dataset can lead to widely varying results for the naive
#   Bayes estimator.
#
# * :class:`~sklearn.svm.LinearSVC` is not a natural probabilistic classifier.
#   In order to interpret its predictions as such, we naively scaled the output
#   of the :term:`decision_function` into [0, 1] by applying min-max scaling in
#   the `NaivelyCalibratedLinearSVC` wrapper class defined above. This
#   estimator shows a typical sigmoid-shaped calibration curve on this data:
#   predictions larger than 0.5 correspond to samples with an even larger
#   effective positive class fraction (above the diagonal), while predictions
#   below 0.5 correspond to even lower positive class fractions (below the
#   diagonal). Such under-confident predictions are typical for maximum-margin
#   methods [1]_.
#
# * :class:`~sklearn.ensemble.RandomForestClassifier`'s prediction histogram
#   shows peaks at approx. 0.2 and 0.9 probability, while probabilities close
#   to 0 or 1 are very rare. An explanation for this is given by [1]_:
#   "Methods such as bagging and random forests that average
#   predictions from a base set of models can have difficulty making
#   predictions near 0 and 1 because variance in the underlying base models
#   will bias predictions that should be near zero or one away from these
#   values. Because predictions are restricted to the interval [0, 1], errors
#   caused by variance tend to be one-sided near zero and one. For example, if
#   a model should predict p = 0 for a case, the only way bagging can achieve
#   this is if all bagged trees predict zero. If we add noise to the trees that
#   bagging is averaging over, this noise will cause some trees to predict
#   values larger than 0 for this case, thus moving the average prediction of
#   the bagged ensemble away from 0. We observe this effect most strongly with
#   random forests because the base-level trees trained with random forests
#   have relatively high variance due to feature subsetting." This effect can
#   make random forests under-confident. Despite this possible bias, note that
#   the trees themselves are fit by minimizing either the Gini or Entropy
#   criterion, both of which lead to splits that minimize proper scoring rules:
#   the Brier score or the log-loss respectively. See :ref:`the user guide
#   <tree_mathematical_formulation>` for more details. This can explain why
#   this model shows a good enough calibration curve on this particular example
#   dataset. Indeed, the Random Forest model is not significantly more
#   under-confident than the Logistic Regression model.
#
# Feel free to re-run this example with different random seeds and other
# dataset generation parameters to see how different the calibration plots can
# look; a minimal sketch of such a re-run is given below.
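# %%
# As a starting point for such experiments, the sketch below rebuilds the
# dataset with a different, arbitrarily chosen seed and redraws the calibration
# curves; `random_state=7` has no special meaning, and cloning the estimators
# keeps the fits from the previous sections untouched.

from sklearn.base import clone

X_alt, y_alt = make_classification(
    n_samples=100_000, n_features=20, n_informative=2, n_redundant=2, random_state=7
)
X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    X_alt, y_alt, shuffle=False, test_size=100_000 - train_samples
)

fig_alt, ax_alt = plt.subplots(figsize=(8, 8))
for i, (clf, name) in enumerate(clf_list):
    # Refit a fresh copy of each model on the alternative training split.
    clf_alt = clone(clf)
    clf_alt.fit(X_train_alt, y_train_alt)
    CalibrationDisplay.from_estimator(
        clf_alt,
        X_test_alt,
        y_test_alt,
        n_bins=10,
        name=name,
        ax=ax_alt,
        color=colors(i),
        marker=markers[i],
    )
ax_alt.grid()
ax_alt.set_title("Calibration plots (different dataset seed)")
plt.show()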
# %%
# In general, Logistic Regression and Random Forest will tend to be the
# best calibrated classifiers, while SVC will often display the typical
# under-confident miscalibration. The naive Bayes model is also often poorly
# calibrated but the general shape of its calibration curve can vary widely
# depending on the dataset.
#
# Finally, note that for some dataset seeds, all models are poorly calibrated,
# even when tuning the regularization parameter as above. This is bound to
# happen when the training size is too small or when the model is severely
# misspecified.
#
# References
# ----------
#
# .. [1] Niculescu-Mizil, A. & Caruana, R., "Predicting Good Probabilities
#        with Supervised Learning", ICML 2005.
#
# .. [2] Domingos, P. & Pazzani, M., "Beyond Independence: Conditions for the
#        Optimality of the Simple Bayesian Classifier", Proc. 13th Intl. Conf.
#        Machine Learning, 1996.
#
# .. [3] Zadrozny, B. & Elkan, C., "Obtaining Calibrated Probability Estimates
#        from Decision Trees and Naive Bayesian Classifiers", ICML 2001.