{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test for `DriftChecker` - `pydrift`\n", "\n", "We're going to test how `DriftChecker` works with the famous Titanic dataset.\n", "\n", "# Dependencies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from sklearn import set_config\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.compose import make_column_transformer\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import OrdinalEncoder\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "from pydrift import DriftChecker\n", "from pydrift.exceptions import ColumnsNotMatchException\n", "from pydrift.constants import PATH_DATA, RANDOM_STATE\n", "\n", "\n", "set_config(display='diagram')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Read Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df_titanic = pd.read_csv(PATH_DATA / 'titanic.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Constants" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "DATA_LENGTH = df_titanic.shape[0]\n", "TARGET = 'Survived'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Split\n", "\n", "A stratified 50% train/test split of the same data gives us a no-drift problem.\n", "\n", "We drop the `Ticket` and `Cabin` features because of their high cardinality; `PassengerId` and `Name` are dropped as well, and the target column is separated from the features." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "X = df_titanic.drop(columns=['Ticket', 'Cabin', 'PassengerId', 'Name', TARGET])\n", "y = df_titanic[TARGET]\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=.5, random_state=RANDOM_STATE, stratify=y\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test `ColumnsNotMatchException`" ] }, { "cell_type": 
"code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Different columns for left and right dataframes\n", "\n", "Columns in right dataframe but not in left one: Sex\n", "Columns in left dataframe but not in right one: None\n" ] } ], "source": [ "try:\n", " DriftChecker(X_train.drop(columns='Sex'), X_test)\n", "except ColumnsNotMatchException as exception:\n", " print(exception)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Same With Right DataFrame " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Different columns for left and right dataframes\n", "\n", "Columns in right dataframe but not in left one: None\n", "Columns in left dataframe but not in right one: SibSp\n" ] } ], "source": [ "try:\n", " DriftChecker(X_train, X_test.drop(columns='SibSp'))\n", "except ColumnsNotMatchException as exception:\n", " print(exception)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Both Dataframes With Different Columns " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Different columns for left and right dataframes\n", "\n", "Columns in right dataframe but not in left one: Fare\n", "Columns in left dataframe but not in right one: Embarked\n" ] } ], "source": [ "try:\n", " DriftChecker(X_train.drop(columns='Fare'), X_test.drop(columns='Embarked'))\n", "except ColumnsNotMatchException as exception:\n", " print(exception)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test `ml_model_can_discriminate` Feature With Different Model\n", "\n", "You can pass any model to be the discriminative ml model, for example a pipeline with logistic regression" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('columntransformer',\n",
" ColumnTransformer(transformers=[('pipeline',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='most_frequent')),\n",
" ('ordinalencoder',\n",
" OrdinalEncoder())]),\n",
" Index(['Sex', 'Embarked'], dtype='object')),\n",
" ('simpleimputer',\n",
" SimpleImputer(strategy='median'),\n",
" Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])),\n",
" ('logisticregression',\n",
" LogisticRegression(max_iter=1000, random_state=1994))])ColumnTransformer(transformers=[('pipeline',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='most_frequent')),\n",
" ('ordinalencoder',\n",
" OrdinalEncoder())]),\n",
" Index(['Sex', 'Embarked'], dtype='object')),\n",
" ('simpleimputer',\n",
" SimpleImputer(strategy='median'),\n",
" Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))])Index(['Sex', 'Embarked'], dtype='object')
SimpleImputer(strategy='most_frequent')
OrdinalEncoder()
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
SimpleImputer(strategy='median')
LogisticRegression(max_iter=1000, random_state=1994)