{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test For `DriftCheckerEstimator`-`pydrift` \n", "\n", "We're going to test how it works with the famous titanic dataset\n", "\n", "# Dependencies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from sklearn import set_config\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.compose import make_column_transformer\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import OrdinalEncoder\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import roc_auc_score\n", "from catboost import CatBoostClassifier\n", "\n", "from pydrift import DriftCheckerEstimator\n", "from pydrift.exceptions import ColumnsNotMatchException\n", "from pydrift.constants import PATH_DATA, RANDOM_STATE\n", "from pydrift.models import cat_features_fillna\n", "from pydrift.exceptions import DriftEstimatorException\n", "\n", "\n", "set_config(display='diagram')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Read Data " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df_titanic = pd.read_csv(PATH_DATA / 'titanic.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Constants " ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "DATA_LENGTH = df_titanic.shape[0]\n", "TARGET = 'Survived'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Split\n", "\n", "50% sample will give us a non-drift problem\n", "\n", "We drop Ticket and Cabin features because of cardinality" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "X = df_titanic.drop(columns=['Ticket', 'Cabin', 'PassengerId', 'Name', TARGET])\n", "y = df_titanic[TARGET]\n", "\n", "cat_features = 
(X\n", " .select_dtypes(include=['category', 'object'])\n", " .columns)\n", "\n", "X_filled = cat_features_fillna(X, cat_features)\n", "\n", "X_train_filled, X_test_filled, y_train, y_test = train_test_split(\n", " X_filled, y, test_size=.5, random_state=RANDOM_STATE, stratify=y\n", ")\n", "\n", "catboost_classifier = CatBoostClassifier(\n", " num_trees=5,\n", " max_depth=3,\n", " cat_features=cat_features,\n", " random_state=RANDOM_STATE,\n", " verbose=False\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Build Pipeline With DriftCheckerEstimator\n", "\n", "Catboost as estimator" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('driftcheckerestimator',\n",
       "                 DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n",
       "                                       ml_classifier_model=))])
DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n",
       "                      ml_classifier_model=)
" ], "text/plain": [ "Pipeline(steps=[('driftcheckerestimator',\n", " DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n", " ml_classifier_model=))])" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pipeline_catboost_drift_checker = make_pipeline(\n", " DriftCheckerEstimator(ml_classifier_model=catboost_classifier, column_names=X.columns)\n", ")\n", "\n", "display(pipeline_catboost_drift_checker)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let´s Fit And Predict" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AUC training data: 0.86\n", "AUC testing data: 0.84\n" ] } ], "source": [ "pipeline_catboost_drift_checker.fit(X_train_filled, y_train)\n", "\n", "y_score_train = pipeline_catboost_drift_checker.predict_proba(X_train_filled)[:, 1]\n", "y_score_test = pipeline_catboost_drift_checker.predict_proba(X_test_filled)[:, 1]\n", "\n", "auc_train = roc_auc_score(y_true=y_train, y_score=y_score_train)\n", "auc_test = roc_auc_score(y_true=y_test, y_score=y_score_test)\n", "\n", "print(f'AUC training data: {auc_train:.2f}')\n", "print(f'AUC testing data: {auc_test:.2f}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Same With Logistic Regression Pipeline " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('columntransformer',\n",
       "                 ColumnTransformer(transformers=[('pipeline',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
       "                                                                  ('ordinalencoder',\n",
       "                                                                   OrdinalEncoder())]),\n",
       "                                                  Index(['Sex', 'Embarked'], dtype='object')),\n",
       "                                                 ('simpleimputer',\n",
       "                                                  SimpleImputer(strategy='median'),\n",
       "                                                  Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])),\n",
       "                ('driftcheckerestimator',\n",
       "                 DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n",
       "                                       ml_classifier_model=LogisticRegression(max_iter=1000,\n",
       "                                                                              random_state=1994)))])
ColumnTransformer(transformers=[('pipeline',\n",
       "                                 Pipeline(steps=[('simpleimputer',\n",
       "                                                  SimpleImputer(strategy='most_frequent')),\n",
       "                                                 ('ordinalencoder',\n",
       "                                                  OrdinalEncoder())]),\n",
       "                                 Index(['Sex', 'Embarked'], dtype='object')),\n",
       "                                ('simpleimputer',\n",
       "                                 SimpleImputer(strategy='median'),\n",
       "                                 Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])
Index(['Sex', 'Embarked'], dtype='object')
SimpleImputer(strategy='most_frequent')
OrdinalEncoder()
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
SimpleImputer(strategy='median')
DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n",
       "                      ml_classifier_model=LogisticRegression(max_iter=1000,\n",
       "                                                             random_state=1994))
LogisticRegression(max_iter=1000, random_state=1994)
" ], "text/plain": [ "Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('pipeline',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('ordinalencoder',\n", " OrdinalEncoder())]),\n", " Index(['Sex', 'Embarked'], dtype='object')),\n", " ('simpleimputer',\n", " SimpleImputer(strategy='median'),\n", " Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])),\n", " ('driftcheckerestimator',\n", " DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n", " ml_classifier_model=LogisticRegression(max_iter=1000,\n", " random_state=1994)))])" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=.5, random_state=RANDOM_STATE, stratify=y\n", ")\n", "\n", "categorical_pipeline = make_pipeline(\n", " SimpleImputer(strategy='most_frequent'),\n", " OrdinalEncoder()\n", ")\n", "\n", "column_transformer = make_column_transformer(\n", " (categorical_pipeline, cat_features),\n", " (SimpleImputer(strategy='median'), X_train.select_dtypes(include='number').columns)\n", ")\n", "\n", "pipeline_lr_drift_checker = make_pipeline(\n", " column_transformer,\n", " DriftCheckerEstimator(\n", " ml_classifier_model=LogisticRegression(max_iter=1000,\n", " random_state=RANDOM_STATE),\n", " column_names=X.columns\n", " )\n", ")\n", "\n", "display(pipeline_lr_drift_checker)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let´s Fit And Predict " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AUC training data: 0.851\n", "AUC testing data: 0.855\n" ] } ], "source": [ "pipeline_lr_drift_checker.fit(X_train, y_train)\n", "\n", "y_score_train = pipeline_lr_drift_checker.predict_proba(X_train)[:, 1]\n", "y_score_test = 
pipeline_lr_drift_checker.predict_proba(X_test)[:, 1]\n", "\n", "auc_train = roc_auc_score(y_true=y_train, y_score=y_score_train)\n", "auc_test = roc_auc_score(y_true=y_test, y_score=y_score_test)\n", "\n", "print(f'AUC training data: {auc_train:.3f}')\n", "print(f'AUC testing data: {auc_test:.3f}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Ok, Now With Drifted Data " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "X = df_titanic.drop(columns=['Ticket', 'Cabin', 'PassengerId', 'Name', TARGET])\n", "y = df_titanic[TARGET]\n", "\n", "cat_features = (X\n", " .select_dtypes(include=['category', 'object'])\n", " .columns)\n", "\n", "X_filled = cat_features_fillna(X, cat_features)\n", "\n", "X_train_filled, X_test_filled, y_train, y_test = train_test_split(\n", " X_filled, y, test_size=.5, random_state=RANDOM_STATE, stratify=y\n", ")\n", "\n", "df_train_filled = pd.concat([X_train_filled, y_train], axis=1)\n", "df_train_filled_drifted = df_train_filled[(df_train_filled['Pclass'] > 1) & (df_train_filled['Fare'] > 10)].copy()\n", "\n", "X_train_filled_drifted = df_train_filled_drifted.drop(columns=TARGET)\n", "y_train_filled_drifted = df_train_filled_drifted[TARGET]\n", "\n", "df_test_filled = pd.concat([X_test_filled, y_test], axis=1)\n", "df_test_filled_drifted = df_test_filled[~(df_test_filled['Pclass'] > 1) & (df_test_filled['Fare'] > 10)].copy()\n", "\n", "X_test_filled_drifted = df_test_filled_drifted.drop(columns=TARGET)\n", "y_test_filled_drifted = df_test_filled_drifted[TARGET]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let's Try To Fit And Predict \n", "\n", "`DriftEstimatorException` tells you that some data drift was detected. You can access the `drifted_columns` attribute to check which columns drifted; we will inspect the drifted features in the next cell." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Drift found in numerical 
columns check step\n", "Drift found in categorical columns check step\n", "Drift found in discriminative model step\n", "Drift found in pipeline_catboost_drift_checker\n" ] } ], "source": [ "pipeline_catboost_drift_checker.fit(X_train_filled_drifted, y_train_filled_drifted)\n", "\n", "y_score_train = pipeline_catboost_drift_checker.predict_proba(X_train_filled_drifted)[:, 1]\n", "\n", "try:\n", " y_score_test = pipeline_catboost_drift_checker.predict_proba(X_test_filled_drifted)[:, 1]\n", "\n", " auc_train = roc_auc_score(y_true=y_train_filled_drifted, y_score=y_score_train)\n", " auc_test = roc_auc_score(y_true=y_test_filled_drifted, y_score=y_score_test)\n", "\n", " print(f'AUC training data: {auc_train:.2f}')\n", " print(f'AUC testing data: {auc_test:.2f}')\n", "except DriftEstimatorException:\n", " print('Drift found in pipeline_catboost_drift_checker')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# You Can Get Drifted Features From `DriftCheckerEstimator` Object" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Fare, Parch, Age, Embarked, Pclass'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drifted_features = (\n", " pipeline_catboost_drift_checker\n", " .named_steps['driftcheckerestimator']\n", " .get_drifted_features()\n", ")\n", "\n", "drifted_features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# You Can Also Get High Cardinality Features\n", "\n", "None in this case" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(\n", " pipeline_catboost_drift_checker\n", " .named_steps['driftcheckerestimator']\n", " .get_high_cardinality_features()\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }