{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test For `DriftCheckerEstimator` - `pydrift`\n",
    "\n",
    "We're going to test how `DriftCheckerEstimator` works with the famous Titanic dataset.\n",
    "\n",
    "# Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from sklearn import set_config\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.compose import make_column_transformer\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from catboost import CatBoostClassifier\n",
    "\n",
    "from pydrift import DriftCheckerEstimator\n",
    "from pydrift.constants import PATH_DATA, RANDOM_STATE\n",
    "from pydrift.exceptions import ColumnsNotMatchException, DriftEstimatorException\n",
    "from pydrift.models import cat_features_fillna\n",
    "\n",
    "\n",
    "# Ask scikit-learn to display estimators/pipelines as diagrams in cell outputs.\n",
    "set_config(display='diagram')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PATH_DATA is provided by pydrift.constants; the CSV is expected to live there.\n",
    "df_titanic = pd.read_csv(PATH_DATA / 'titanic.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_LENGTH = df_titanic.shape[0]  # number of rows in the raw dataset\n",
    "TARGET = 'Survived'                # label column\n",
    "TEST_SIZE = .5                     # fraction of rows held out for the test split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Split\n",
    "\n",
    "A stratified 50/50 random split of the same dataset should give us a non-drift problem: both halves are sampled from the same data\n",
    "\n",
    "We drop the `Ticket` and `Cabin` features because of their cardinality; `PassengerId` and `Name` are dropped as well"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features: everything except the dropped columns and the target.\n",
    "X = df_titanic.drop(columns=['Ticket', 'Cabin', 'PassengerId', 'Name', TARGET])\n",
    "y = df_titanic[TARGET]\n",
    "\n",
    "# Columns with 'category' or 'object' dtype are treated as categorical features.\n",
    "cat_features = (\n",
    "    X\n",
    "    .select_dtypes(include=['category', 'object'])\n",
    "    .columns\n",
    ")\n",
    "\n",
    "# NOTE(review): presumably fills missing values in the categorical columns\n",
    "# (see pydrift.models.cat_features_fillna) - confirm against the library.\n",
    "X_filled = cat_features_fillna(X, cat_features)\n",
    "\n",
    "# Stratify on the target so both halves keep the same class balance.\n",
    "X_train_filled, X_test_filled, y_train, y_test = train_test_split(\n",
    "    X_filled, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y\n",
    ")\n",
    "\n",
    "# Deliberately small model (5 trees, depth 3), seeded for reproducibility.\n",
    "catboost_classifier = CatBoostClassifier(\n",
    "    num_trees=5,\n",
    "    max_depth=3,\n",
    "    cat_features=cat_features,\n",
    "    random_state=RANDOM_STATE,\n",
    "    verbose=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build Pipeline With DriftCheckerEstimator\n",
    "\n",
    "CatBoost as estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
Pipeline(steps=[('driftcheckerestimator',\n",
" DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n",
" ml_classifier_model=))]) DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n",
" ml_classifier_model=) Pipeline(steps=[('columntransformer',\n",
" ColumnTransformer(transformers=[('pipeline',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='most_frequent')),\n",
" ('ordinalencoder',\n",
" OrdinalEncoder())]),\n",
" Index(['Sex', 'Embarked'], dtype='object')),\n",
" ('simpleimputer',\n",
" SimpleImputer(strategy='median'),\n",
" Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])),\n",
" ('driftcheckerestimator',\n",
" DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n",
" ml_classifier_model=LogisticRegression(max_iter=1000,\n",
" random_state=1994)))])ColumnTransformer(transformers=[('pipeline',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='most_frequent')),\n",
" ('ordinalencoder',\n",
" OrdinalEncoder())]),\n",
" Index(['Sex', 'Embarked'], dtype='object')),\n",
" ('simpleimputer',\n",
" SimpleImputer(strategy='median'),\n",
" Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])Index(['Sex', 'Embarked'], dtype='object')
SimpleImputer(strategy='most_frequent')
OrdinalEncoder()
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
SimpleImputer(strategy='median')
DriftCheckerEstimator(column_names=Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object'),\n",
" ml_classifier_model=LogisticRegression(max_iter=1000,\n",
" random_state=1994))LogisticRegression(max_iter=1000, random_state=1994)