""" Getting started with categorical data ===================================== Experimental support for categorical data. In before, users need to run an encoder themselves before passing the data into XGBoost, which creates a sparse matrix and potentially increase memory usage. This demo showcases the experimental categorical data support, more advanced features are planned. .. versionadded:: 1.5.0 See Also -------- - :doc:`Tutorial ` - :ref:`sphx_glr_python_examples_cat_in_the_dat.py` - :ref:`sphx_glr_python_examples_cat_pipeline.py` """ from typing import Tuple import numpy as np import pandas as pd import xgboost as xgb def make_categorical( n_samples: int, n_features: int, n_categories: int, onehot: bool ) -> Tuple[pd.DataFrame, pd.Series]: """Make some random data for demo.""" rng = np.random.RandomState(1994) pd_dict = {} for i in range(n_features + 1): c = rng.randint(low=0, high=n_categories, size=n_samples) pd_dict[str(i)] = pd.Series(c, dtype=np.int64) df = pd.DataFrame(pd_dict) label = df.iloc[:, 0] df = df.iloc[:, 1:] for i in range(0, n_features): label += df.iloc[:, i] label += 1 df = df.astype("category") categories = np.arange(0, n_categories) for col in df.columns: df[col] = df[col].cat.set_categories(categories) if onehot: return pd.get_dummies(df), label return df, label def main() -> None: # Use builtin categorical data support # For scikit-learn interface, the input data should be pandas DataFrame or cudf # DataFrame with categorical features. If an numpy/cupy array is used instead, the # `feature_types` for `XGBRegressor` should be set accordingly. X, y = make_categorical(100, 10, 4, False) # Specify `enable_categorical` to True, also we use onehot-encoding-based split here # for demonstration. For details see the document of `max_cat_to_onehot`. reg = xgb.XGBRegressor( tree_method="hist", enable_categorical=True, max_cat_to_onehot=5, device="cuda" ) reg.fit(X, y, eval_set=[(X, y)]) # Pass in already encoded data X_enc, y_enc = make_categorical(100, 10, 4, True) reg_enc = xgb.XGBRegressor(tree_method="hist", device="cuda") reg_enc.fit(X_enc, y_enc, eval_set=[(X_enc, y_enc)]) reg_results = np.array(reg.evals_result()["validation_0"]["rmse"]) reg_enc_results = np.array(reg_enc.evals_result()["validation_0"]["rmse"]) # Check that they have same results np.testing.assert_allclose(reg_results, reg_enc_results) # Convert to DMatrix for SHAP value booster: xgb.Booster = reg.get_booster() m = xgb.DMatrix(X, enable_categorical=True) # specify categorical data support. SHAP = booster.predict(m, pred_contribs=True) margin = booster.predict(m, output_margin=True) np.testing.assert_allclose( np.sum(SHAP, axis=len(SHAP.shape) - 1), margin, rtol=1e-3 ) if __name__ == "__main__": main()