{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "\n", "from matplotlib import pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(32769, 10)\n", "\n", "(58921, 10)\n", "\n" ] } ], "source": [ "# Directory holding train.csv / test.csv; change this one constant to run the notebook on another machine.\n", "DATA_DIR = Path('/media/lvision/Sabrent/kaggle/2013/amazon-employee-access-challenge')\n", "\n", "train_df = pd.read_csv(DATA_DIR / 'train.csv')\n", "test_df = pd.read_csv(DATA_DIR / 'test.csv')\n", "\n", "# Sanity check: shape and container type of each loaded frame.\n", "print(train_df.shape)\n", "print(type(train_df))\n", "print(test_df.shape)\n", "print(type(test_df))\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ACTIONRESOURCEMGR_IDROLE_ROLLUP_1ROLE_ROLLUP_2ROLE_DEPTNAMEROLE_TITLEROLE_FAMILY_DESCROLE_FAMILYROLE_CODE
013935385475117961118300123472117905117906290919117908
11171831540117961118343123125118536118536308574118539
21367241445711821911822011788411787926795219721117880
31361355396117961118343119993118321240983290919118322
4142680590511792911793011956911932312393219793119325
\n", "
" ], "text/plain": [ " ACTION RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2 ROLE_DEPTNAME \\\n", "0 1 39353 85475 117961 118300 123472 \n", "1 1 17183 1540 117961 118343 123125 \n", "2 1 36724 14457 118219 118220 117884 \n", "3 1 36135 5396 117961 118343 119993 \n", "4 1 42680 5905 117929 117930 119569 \n", "\n", " ROLE_TITLE ROLE_FAMILY_DESC ROLE_FAMILY ROLE_CODE \n", "0 117905 117906 290919 117908 \n", "1 118536 118536 308574 118539 \n", "2 117879 267952 19721 117880 \n", "3 118321 240983 290919 118322 \n", "4 119323 123932 19793 119325 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idRESOURCEMGR_IDROLE_ROLLUP_1ROLE_ROLLUP_2ROLE_DEPTNAMEROLE_TITLEROLE_FAMILY_DESCROLE_FAMILYROLE_CODE
01787667273411807911808011787811787911817719721117880
12406444378117961118327118507118863122008118398118865
23754432395117961118300119488118172301534249618118175
344321919986117961118225118403120773136187118960120774
454209350015117961118343119598118422300136118424118425
\n", "
" ], "text/plain": [ " id RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2 ROLE_DEPTNAME \\\n", "0 1 78766 72734 118079 118080 117878 \n", "1 2 40644 4378 117961 118327 118507 \n", "2 3 75443 2395 117961 118300 119488 \n", "3 4 43219 19986 117961 118225 118403 \n", "4 5 42093 50015 117961 118343 119598 \n", "\n", " ROLE_TITLE ROLE_FAMILY_DESC ROLE_FAMILY ROLE_CODE \n", "0 117879 118177 19721 117880 \n", "1 118863 122008 118398 118865 \n", "2 118172 301534 249618 118175 \n", "3 120773 136187 118960 120774 \n", "4 118422 300136 118424 118425 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "y = train_df['ACTION']\n", "X = train_df.drop(columns='ACTION') # or X = train_df.drop('ACTION', axis=1)\n", "\n", "X_test = test_df.drop(columns='id')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "SEED = 1\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=SEED)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# From now on, we try different packages\n", "# including: catboost, xgboost, lightgbm, h2o, etc." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import catboost as ctb" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "65d847025e294ae2997a373903dc358c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.069882\n", "0:\ttest: 0.5400959\tbest: 0.5400959 (0)\ttotal: 53.7ms\tremaining: 53.6s\n", "200:\ttest: 0.8020842\tbest: 0.8020842 (200)\ttotal: 1.05s\tremaining: 4.17s\n", "400:\ttest: 0.8237941\tbest: 0.8237941 (400)\ttotal: 2.03s\tremaining: 3.03s\n", "600:\ttest: 0.8328464\tbest: 0.8330283 (598)\ttotal: 3.01s\tremaining: 2s\n", "800:\ttest: 0.8366271\tbest: 0.8370599 (785)\ttotal: 4.01s\tremaining: 997ms\n", "999:\ttest: 0.8417832\tbest: 0.8417832 (999)\ttotal: 5s\tremaining: 0us\n", "\n", "bestTest = 0.8417831567\n", "bestIteration = 999\n", "\n", "CPU times: user 1min 59s, sys: 11.3 s, total: 2min 10s\n", "Wall time: 5.1 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss', # objective function\n", " 'eval_metric':'AUC', # metric\n", " 'verbose': 200, # output to stdout info about training process every 200 iterations\n", " 'random_seed': SEED\n", " }\n", "cbc_1 = ctb.CatBoostClassifier(**params)\n", "cbc_1.fit(X_train, y_train, # data to train on (required parameters, unless we provide X as a pool object, will be shown below)\n", " eval_set=(X_valid, y_valid), # data to validate on\n", " use_best_model=True, # True if we don't want to save trees created after iteration with the best validation score\n", " plot=True # True for visualization of the training process (it is 
not shown in a published kernel - try executing this code)\n", " );" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 1, 2, 3, 4, 5, 6, 7, 8]\n" ] } ], "source": [ "cat_features = list(range(X.shape[1]))\n", "print(cat_features)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 1, 2, 3, 4, 5, 6, 7, 8]\n" ] } ], "source": [ "condition = True # here we specify what condition should be satisfied only by the names of categorical features\n", "cat_features_names = [col for col in X.columns if condition]\n", "cat_features = [X.columns.get_loc(col) for col in cat_features_names]\n", "print(cat_features)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "45657692d7724ffb91700b3bd11e56ed", "version_major": 2, "version_minor": 0 }, "text/plain": [ "MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.069882\n", "0:\ttest: 0.5637606\tbest: 0.5637606 (0)\ttotal: 29.9ms\tremaining: 29.9s\n", "200:\ttest: 0.8959353\tbest: 0.8959406 (199)\ttotal: 5s\tremaining: 19.9s\n", "400:\ttest: 0.8985289\tbest: 0.8990341 (378)\ttotal: 10.6s\tremaining: 15.8s\n", "600:\ttest: 0.9005314\tbest: 0.9006239 (594)\ttotal: 16.2s\tremaining: 10.8s\n", "800:\ttest: 0.9003434\tbest: 0.9014996 (744)\ttotal: 21.7s\tremaining: 5.38s\n", "999:\ttest: 0.8999324\tbest: 0.9014996 (744)\ttotal: 26.3s\tremaining: 0us\n", "\n", "bestTest = 0.9014995851\n", "bestIteration = 744\n", "\n", "Shrink model to first 745 iterations.\n", "CPU times: user 11min 50s, sys: 32.5 s, total: 12min 23s\n", "Wall time: 26.5 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": 
{}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss',\n", " 'eval_metric':'AUC',\n", " 'cat_features': cat_features,\n", " 'verbose': 200,\n", " 'random_seed': SEED\n", " }\n", "cbc_2 = ctb.CatBoostClassifier(**params)\n", "cbc_2.fit(X_train, y_train,\n", " eval_set=(X_valid, y_valid),\n", " use_best_model=True,\n", " plot=True\n", " );" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "90ef940925ff43c3835084146f73ccea", "version_major": 2, "version_minor": 0 }, "text/plain": [ "MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.069882\n", "0:\ttest: 0.5637606\tbest: 0.5637606 (0)\ttotal: 18.8ms\tremaining: 18.8s\n", "200:\ttest: 0.8959353\tbest: 0.8959406 (199)\ttotal: 5.08s\tremaining: 20.2s\n", "400:\ttest: 0.8985289\tbest: 0.8990341 (378)\ttotal: 10.6s\tremaining: 15.9s\n", "600:\ttest: 0.9005314\tbest: 0.9006239 (594)\ttotal: 16.3s\tremaining: 10.8s\n", "800:\ttest: 0.9003434\tbest: 0.9014996 (744)\ttotal: 21.9s\tremaining: 5.43s\n", "Stopped by overfitting detector (200 iterations wait)\n", "\n", "bestTest = 0.9014995851\n", "bestIteration = 744\n", "\n", "Shrink model to first 745 iterations.\n", "CPU times: user 11min 39s, sys: 33.3 s, total: 12min 12s\n", "Wall time: 25.5 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss',\n", " 'eval_metric':'AUC',\n", " 'cat_features': cat_features,\n", " 'early_stopping_rounds': 200,\n", " 'verbose': 200,\n", " 'random_seed': SEED\n", " }\n", "cbc_2 = ctb.CatBoostClassifier(**params)\n", "cbc_2.fit(X_train, y_train, \n", " eval_set=(X_valid, y_valid), \n", " use_best_model=True, 
\n", " plot=True\n", " );\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fca6c6a6371e4eaf8f846498e00b6716", "version_major": 2, "version_minor": 0 }, "text/plain": [ "MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.054241\n", "0:\tlearn: 0.6174190\ttest: 0.6184174\tbest: 0.6184174 (0)\ttotal: 33.4ms\tremaining: 33.3s\n", "200:\tlearn: 0.8536408\ttest: 0.8762504\tbest: 0.8762504 (200)\ttotal: 6.53s\tremaining: 26s\n", "400:\tlearn: 0.8636365\ttest: 0.8807679\tbest: 0.8807842 (398)\ttotal: 13s\tremaining: 19.5s\n", "600:\tlearn: 0.8688402\ttest: 0.8825358\tbest: 0.8826348 (587)\ttotal: 19.5s\tremaining: 12.9s\n", "800:\tlearn: 0.8726251\ttest: 0.8827334\tbest: 0.8829701 (760)\ttotal: 25.9s\tremaining: 6.44s\n", "999:\tlearn: 0.8763252\ttest: 0.8832531\tbest: 0.8835157 (986)\ttotal: 32.7s\tremaining: 0us\n", "bestTest = 0.8835157454\n", "bestIteration = 986\n", "Shrink model to first 987 iterations.\n", "CPU times: user 42.1 s, sys: 4.77 s, total: 46.8 s\n", "Wall time: 33.2 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss',\n", " 'eval_metric':'AUC',\n", " 'cat_features': cat_features,\n", " 'task_type': 'GPU',\n", " 'verbose': 200,\n", " 'random_seed': SEED\n", " }\n", "cbc_3 = ctb.CatBoostClassifier(**params)\n", "cbc_3.fit(X_train, y_train,\n", " eval_set=(X_valid, y_valid), \n", " use_best_model=True,\n", " plot=True\n", " );" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "304ea78e73654e3cbb24bb980f297d15", "version_major": 2, "version_minor": 0 }, "text/plain": [ 
"MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.054241\n", "0:\tlearn: 0.6174190\ttest: 0.6184174\tbest: 0.6184174 (0)\ttotal: 26.4ms\tremaining: 26.3s\n", "200:\tlearn: 0.8536408\ttest: 0.8762504\tbest: 0.8762504 (200)\ttotal: 6.61s\tremaining: 26.3s\n", "400:\tlearn: 0.8636365\ttest: 0.8807679\tbest: 0.8807842 (398)\ttotal: 13.1s\tremaining: 19.6s\n", "600:\tlearn: 0.8688402\ttest: 0.8825358\tbest: 0.8826348 (587)\ttotal: 19.5s\tremaining: 13s\n", "800:\tlearn: 0.8726251\ttest: 0.8827334\tbest: 0.8829701 (760)\ttotal: 26s\tremaining: 6.45s\n", "999:\tlearn: 0.8763252\ttest: 0.8832522\tbest: 0.8835146 (986)\ttotal: 32.4s\tremaining: 0us\n", "bestTest = 0.8835146129\n", "bestIteration = 986\n", "Shrink model to first 987 iterations.\n", "CPU times: user 42.5 s, sys: 4.17 s, total: 46.7 s\n", "Wall time: 32.9 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss',\n", " 'eval_metric':'AUC',\n", " 'cat_features': cat_features,\n", " 'task_type': 'GPU',\n", " 'border_count': 32,\n", " 'verbose': 200,\n", " 'random_seed': SEED\n", " }\n", "cbc_4 = ctb.CatBoostClassifier(**params)\n", "cbc_4.fit(X_train, y_train, \n", " eval_set=(X_valid, y_valid), \n", " use_best_model=True, \n", " plot=True\n", " );" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Add 5 synthetic columns: on the train split each is the label times a uniform draw,\n", "# on the validation split each is a plain uniform draw.\n", "np.random.seed(SEED)\n", "noise_cols = [f'noise_{i}' for i in range(5)]\n", "for col in noise_cols:\n", "    # assign() returns a new frame, so the frames produced by train_test_split are never\n", "    # written to in place (avoids pandas chained-assignment warnings).\n", "    # The per-column draw order (train first, then valid) is kept so the values are unchanged.\n", "    X_train = X_train.assign(**{col: y_train * np.random.rand(X_train.shape[0])})\n", "    X_valid = X_valid.assign(**{col: np.random.rand(X_valid.shape[0])})" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RESOURCEMGR_IDROLE_ROLLUP_1ROLE_ROLLUP_2ROLE_DEPTNAMEROLE_TITLEROLE_FAMILY_DESCROLE_FAMILYROLE_CODEnoise_0noise_1noise_2noise_3noise_4
167732779813501179611180521229381179051179062909191179080.4170220.0978500.6656000.9790250.491624
234918070145711179611182251199241186852794433085741186870.7203240.8559000.3117630.9293460.391708
327313403951131179611183001198901194331336861184241194350.0001140.2878380.8966240.7040500.606467
78554208547331182901182911201261189801662031182951189820.3023330.2643200.4821950.0284930.182570
164751635860461179611184461203173070243064041183311183320.1467560.0228760.0093070.7267500.623357
\n", "
" ], "text/plain": [ " RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2 ROLE_DEPTNAME \\\n", "16773 27798 1350 117961 118052 122938 \n", "23491 80701 4571 117961 118225 119924 \n", "32731 34039 5113 117961 118300 119890 \n", "7855 42085 4733 118290 118291 120126 \n", "16475 16358 6046 117961 118446 120317 \n", "\n", " ROLE_TITLE ROLE_FAMILY_DESC ROLE_FAMILY ROLE_CODE noise_0 \\\n", "16773 117905 117906 290919 117908 0.417022 \n", "23491 118685 279443 308574 118687 0.720324 \n", "32731 119433 133686 118424 119435 0.000114 \n", "7855 118980 166203 118295 118982 0.302333 \n", "16475 307024 306404 118331 118332 0.146756 \n", "\n", " noise_1 noise_2 noise_3 noise_4 \n", "16773 0.097850 0.665600 0.979025 0.491624 \n", "23491 0.855900 0.311763 0.929346 0.391708 \n", "32731 0.287838 0.896624 0.704050 0.606467 \n", "7855 0.264320 0.482195 0.028493 0.182570 \n", "16475 0.022876 0.009307 0.726750 0.623357 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0989b54b37b94b029a8bad387709377a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.069882\n", "0:\ttest: 0.4990944\tbest: 0.4990944 (0)\ttotal: 12.7ms\tremaining: 12.7s\n", "200:\ttest: 0.5831370\tbest: 0.5894476 (7)\ttotal: 2.58s\tremaining: 10.3s\n", "400:\ttest: 0.5831376\tbest: 0.5894476 (7)\ttotal: 4.77s\tremaining: 7.13s\n", "600:\ttest: 0.5831376\tbest: 0.5894476 (7)\ttotal: 7.19s\tremaining: 4.77s\n", "800:\ttest: 0.5831378\tbest: 0.5894476 (7)\ttotal: 9.2s\tremaining: 2.29s\n", "999:\ttest: 0.5831381\tbest: 0.5894476 (7)\ttotal: 10.9s\tremaining: 0us\n", "\n", "bestTest = 0.5894475816\n", 
"bestIteration = 7\n", "\n", "Shrink model to first 8 iterations.\n", "CPU times: user 4min 25s, sys: 13.9 s, total: 4min 39s\n", "Wall time: 11.1 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss',\n", " 'eval_metric':'AUC',\n", " 'cat_features': cat_features,\n", " 'verbose': 200,\n", " 'random_seed': SEED\n", " }\n", "cbc_5 = ctb.CatBoostClassifier(**params)\n", "cbc_5.fit(X_train, y_train, \n", " eval_set=(X_valid, y_valid), \n", " use_best_model=True, \n", " plot=True\n", " );" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[9, 10, 11, 12, 13]\n" ] } ], "source": [ "# Positions of the synthetic noise columns, looked up by name from `noise_cols`\n", "# rather than assuming they are the last 5 columns of X_train.\n", "ignored_features = [X_train.columns.get_loc(col) for col in noise_cols]\n", "print(ignored_features)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "25685cacd69345c78b865ceb0a659e05", "version_major": 2, "version_minor": 0 }, "text/plain": [ "MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.069882\n", "0:\ttest: 0.5637606\tbest: 0.5637606 (0)\ttotal: 19.2ms\tremaining: 19.2s\n", "200:\ttest: 0.8959353\tbest: 0.8959406 (199)\ttotal: 5.03s\tremaining: 20s\n", "400:\ttest: 0.8985289\tbest: 0.8990341 (378)\ttotal: 10.5s\tremaining: 15.8s\n", "600:\ttest: 0.9005314\tbest: 0.9006239 (594)\ttotal: 16.2s\tremaining: 10.8s\n", "800:\ttest: 0.9003434\tbest: 0.9014996 (744)\ttotal: 21.7s\tremaining: 5.38s\n", "Stopped by overfitting detector (200 iterations wait)\n", "\n", "bestTest = 0.9014995851\n", "bestIteration = 744\n", "\n", "Shrink model to first 745 iterations.\n", "CPU times: user 11min 46s, sys: 34.3 s, total: 12min
21s\n", "Wall time: 25.4 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss',\n", " 'eval_metric':'AUC',\n", " 'cat_features': cat_features,\n", " 'ignored_features': ignored_features,\n", " 'early_stopping_rounds': 200,\n", " 'verbose': 200,\n", " 'random_seed': SEED\n", " }\n", "cbc_6 = ctb.CatBoostClassifier(**params)\n", "cbc_6.fit(X_train, y_train, \n", " eval_set=(X_valid, y_valid), \n", " use_best_model=True, \n", " plot=True\n", " );" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "X_train = X_train.drop(columns=noise_cols)\n", "X_valid = X_valid.drop(columns=noise_cols)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RESOURCEMGR_IDROLE_ROLLUP_1ROLE_ROLLUP_2ROLE_DEPTNAMEROLE_TITLEROLE_FAMILY_DESCROLE_FAMILYROLE_CODE
16773277981350117961118052122938117905117906290919117908
23491807014571117961118225119924118685279443308574118687
32731340395113117961118300119890119433133686118424119435
7855420854733118290118291120126118980166203118295118982
16475163586046117961118446120317307024306404118331118332
\n", "
" ], "text/plain": [ " RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2 ROLE_DEPTNAME \\\n", "16773 27798 1350 117961 118052 122938 \n", "23491 80701 4571 117961 118225 119924 \n", "32731 34039 5113 117961 118300 119890 \n", "7855 42085 4733 118290 118291 120126 \n", "16475 16358 6046 117961 118446 120317 \n", "\n", " ROLE_TITLE ROLE_FAMILY_DESC ROLE_FAMILY ROLE_CODE \n", "16773 117905 117906 290919 117908 \n", "23491 118685 279443 308574 118687 \n", "32731 119433 133686 118424 119435 \n", "7855 118980 166203 118295 118982 \n", "16475 307024 306404 118331 118332 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.head()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "train_data = ctb.Pool(data=X_train,\n", " label=y_train,\n", " cat_features=cat_features\n", " )\n", "\n", "valid_data = ctb.Pool(data=X_valid,\n", " label=y_valid,\n", " cat_features=cat_features\n", " )" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "823cbebb18b3407b856e04b0f831bf11", "version_major": 2, "version_minor": 0 }, "text/plain": [ "MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.069882\n", "0:\ttest: 0.5637606\tbest: 0.5637606 (0)\ttotal: 16.6ms\tremaining: 16.6s\n", "200:\ttest: 0.8959353\tbest: 0.8959406 (199)\ttotal: 5.04s\tremaining: 20s\n", "400:\ttest: 0.8985289\tbest: 0.8990341 (378)\ttotal: 10.6s\tremaining: 15.8s\n", "600:\ttest: 0.9005314\tbest: 0.9006239 (594)\ttotal: 16.3s\tremaining: 10.8s\n", "800:\ttest: 0.9003434\tbest: 0.9014996 (744)\ttotal: 21.8s\tremaining: 5.4s\n", "Stopped by overfitting detector (200 iterations wait)\n", "\n", "bestTest = 0.9014995851\n", "bestIteration = 744\n", "\n", "Shrink model to first 745 
iterations.\n", "CPU times: user 11min 48s, sys: 34.4 s, total: 12min 22s\n", "Wall time: 25.3 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss',\n", " 'eval_metric':'AUC',\n", "# 'cat_features': cat_features, # we don't need to specify this parameter as \n", "# pool object contains info about categorical features\n", " 'early_stopping_rounds': 200,\n", " 'verbose': 200,\n", " 'random_seed': SEED\n", " }\n", "\n", "cbc_7 = ctb.CatBoostClassifier(**params)\n", "cbc_7.fit(train_data, # instead of X_train, y_train\n", " eval_set=valid_data, # instead of (X_valid, y_valid)\n", " use_best_model=True, \n", " plot=True\n", " );" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2c50fefd00644e2785a0cc3df1922786", "version_major": 2, "version_minor": 0 }, "text/plain": [ "MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Training on fold [0/4]\n", "0:\ttest: 0.5000000\tbest: 0.5000000 (0)\ttotal: 20.9ms\tremaining: 20.9s\n", "200:\ttest: 0.8938231\tbest: 0.8938231 (200)\ttotal: 9.4s\tremaining: 37.4s\n", "400:\ttest: 0.8976768\tbest: 0.8976768 (400)\ttotal: 20.5s\tremaining: 30.7s\n", "600:\ttest: 0.9016370\tbest: 0.9016602 (599)\ttotal: 31.9s\tremaining: 21.2s\n", "800:\ttest: 0.9027846\tbest: 0.9027909 (799)\ttotal: 43.4s\tremaining: 10.8s\n", "999:\ttest: 0.9035137\tbest: 0.9035137 (999)\ttotal: 54.9s\tremaining: 0us\n", "\n", "bestTest = 0.9035137273\n", "bestIteration = 999\n", "\n", "Training on fold [1/4]\n", "0:\ttest: 0.5000000\tbest: 0.5000000 (0)\ttotal: 23ms\tremaining: 23s\n", "200:\ttest: 0.8827467\tbest: 0.8832037 (146)\ttotal: 9.55s\tremaining: 38s\n", "400:\ttest: 0.8844741\tbest: 0.8844968 
(397)\ttotal: 20.8s\tremaining: 31s\n", "600:\ttest: 0.8860410\tbest: 0.8864452 (506)\ttotal: 32.8s\tremaining: 21.8s\n", "800:\ttest: 0.8864595\tbest: 0.8864742 (799)\ttotal: 44.7s\tremaining: 11.1s\n", "999:\ttest: 0.8850072\tbest: 0.8864950 (822)\ttotal: 56.4s\tremaining: 0us\n", "\n", "bestTest = 0.8864949928\n", "bestIteration = 822\n", "\n", "Training on fold [2/4]\n", "0:\ttest: 0.5000000\tbest: 0.5000000 (0)\ttotal: 21.1ms\tremaining: 21.1s\n", "200:\ttest: 0.8794576\tbest: 0.8794576 (200)\ttotal: 9.18s\tremaining: 36.5s\n", "400:\ttest: 0.8834346\tbest: 0.8834346 (400)\ttotal: 20.2s\tremaining: 30.2s\n", "600:\ttest: 0.8866012\tbest: 0.8866091 (599)\ttotal: 31.4s\tremaining: 20.8s\n", "800:\ttest: 0.8891931\tbest: 0.8891961 (797)\ttotal: 42.8s\tremaining: 10.6s\n", "999:\ttest: 0.8906350\tbest: 0.8907859 (990)\ttotal: 54s\tremaining: 0us\n", "\n", "bestTest = 0.8907858827\n", "bestIteration = 990\n", "\n", "Training on fold [3/4]\n", "0:\ttest: 0.5000000\tbest: 0.5000000 (0)\ttotal: 20.1ms\tremaining: 20.1s\n", "200:\ttest: 0.8848750\tbest: 0.8848750 (200)\ttotal: 9.73s\tremaining: 38.7s\n", "400:\ttest: 0.8886395\tbest: 0.8886395 (400)\ttotal: 21.8s\tremaining: 32.6s\n", "600:\ttest: 0.8904434\tbest: 0.8904442 (599)\ttotal: 33.8s\tremaining: 22.5s\n", "800:\ttest: 0.8909729\tbest: 0.8913274 (686)\ttotal: 45.7s\tremaining: 11.3s\n", "999:\ttest: 0.8898141\tbest: 0.8913274 (686)\ttotal: 57.5s\tremaining: 0us\n", "\n", "bestTest = 0.8913273864\n", "bestIteration = 686\n", "\n", "CPU times: user 2h 7min 1s, sys: 5min 46s, total: 2h 12min 48s\n", "Wall time: 3min 43s\n" ] } ], "source": [ "%%time\n", "\n", "params = {'loss_function':'Logloss',\n", " 'eval_metric':'AUC',\n", " 'verbose': 200,\n", " 'random_seed': SEED\n", " }\n", "\n", "all_train_data = ctb.Pool(data=X,\n", " label=y,\n", " cat_features=cat_features\n", " )\n", "\n", "scores = ctb.cv(pool=all_train_data,\n", " params=params, \n", " fold_count=4,\n", " seed=SEED, \n", " shuffle=True,\n", " 
stratified=True, # if True the folds are made by preserving the percentage of samples for each class\n", " plot=True\n", " )" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Feature IdImportances
0RESOURCE18.981667
1ROLE_DEPTNAME16.202565
2ROLE_ROLLUP_214.275839
3MGR_ID14.208860
4ROLE_FAMILY_DESC9.218010
5ROLE_TITLE7.735600
6ROLE_FAMILY7.057141
7ROLE_ROLLUP_16.572883
8ROLE_CODE5.747437
\n", "
" ], "text/plain": [ " Feature Id Importances\n", "0 RESOURCE 18.981667\n", "1 ROLE_DEPTNAME 16.202565\n", "2 ROLE_ROLLUP_2 14.275839\n", "3 MGR_ID 14.208860\n", "4 ROLE_FAMILY_DESC 9.218010\n", "5 ROLE_TITLE 7.735600\n", "6 ROLE_FAMILY 7.057141\n", "7 ROLE_ROLLUP_1 6.572883\n", "8 ROLE_CODE 5.747437" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cbc_7.get_feature_importance(prettified=True)\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Feature IdImportances
0RESOURCE18.981667
1ROLE_DEPTNAME16.202565
2ROLE_ROLLUP_214.275839
3MGR_ID14.208860
4ROLE_FAMILY_DESC9.218010
5ROLE_TITLE7.735600
6ROLE_FAMILY7.057141
7ROLE_ROLLUP_16.572883
8ROLE_CODE5.747437
\n", "
" ], "text/plain": [ " Feature Id Importances\n", "0 RESOURCE 18.981667\n", "1 ROLE_DEPTNAME 16.202565\n", "2 ROLE_ROLLUP_2 14.275839\n", "3 MGR_ID 14.208860\n", "4 ROLE_FAMILY_DESC 9.218010\n", "5 ROLE_TITLE 7.735600\n", "6 ROLE_FAMILY 7.057141\n", "7 ROLE_ROLLUP_1 6.572883\n", "8 ROLE_CODE 5.747437" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_importance_df = pd.DataFrame(cbc_7.get_feature_importance(prettified=True), columns=['Feature Id', 'Importances'])\n", "feature_importance_df" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Bar chart of the per-feature importances held in feature_importance_df.\n", "fig, ax = plt.subplots(figsize=(12, 6))\n", "sns.barplot(data=feature_importance_df, x=\"Importances\", y=\"Feature Id\", ax=ax)\n", "ax.set_title('CatBoost features importance:');" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "
\n", " Visualization omitted, Javascript library not loaded!
\n", " Have you run `initjs()` in this notebook? If this notebook was from another\n", " user you must also trust this notebook (File -> Trust notebook). If you are viewing\n", " this notebook on github the Javascript has been stripped for security. If you are using\n", " JupyterLab this error is because a JupyterLab extension has not yet been written.\n", "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import shap\n", "\n", "# Load the SHAP JavaScript helpers before the interactive plot is rendered.\n", "shap.initjs()\n", "\n", "# SHAP values of cbc_7 computed on the train_data Pool.\n", "explainer = shap.TreeExplainer(cbc_7)\n", "shap_values = explainer.shap_values(train_data)\n", "\n", "# Force plot restricted to the first 100 rows of X_train.\n", "head = slice(0, 100)\n", "shap.force_plot(explainer.expected_value, shap_values[head, :], X_train.iloc[head, :])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
"# Global view of the SHAP values computed in the previous cell.\n",
"shap.summary_plot(shap_values, X_train)"
] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"Learning rate set to 0.069882\n",
"0:\ttest: 0.5797111\tbest: 0.5797111 (0)\ttotal: 18.7ms\tremaining: 18.7s\n",
"200:\ttest: 0.8638646\tbest: 0.8638646 (200)\ttotal: 5.21s\tremaining: 20.7s\n",
"400:\ttest: 0.8678851\tbest: 0.8679522 (398)\ttotal: 10.7s\tremaining: 16s\n",
"600:\ttest: 0.8701402\tbest: 0.8701489 (589)\ttotal: 16.4s\tremaining: 10.9s\n",
"800:\ttest: 0.8708947\tbest: 0.8715082 (745)\ttotal: 22.2s\tremaining: 5.51s\n",
"999:\ttest: 0.8708311\tbest: 0.8721216 (875)\ttotal: 27.9s\tremaining: 0us\n",
"\n",
"bestTest = 0.8721216295\n",
"bestIteration = 875\n",
"\n",
"Shrink model to first 876 iterations.\n",
"Learning rate set to 0.069883\n",
"0:\ttest: 0.5000000\tbest: 0.5000000 (0)\ttotal: 5.46ms\tremaining: 5.45s\n",
"200:\ttest: 0.8957267\tbest: 0.8957267 (200)\ttotal: 5.18s\tremaining: 20.6s\n",
"400:\ttest: 0.9009627\tbest: 0.9009908 (396)\ttotal: 11s\tremaining: 16.4s\n",
"600:\ttest: 0.9022419\tbest: 0.9023966 (565)\ttotal: 16.8s\tremaining: 11.1s\n",
"800:\ttest: 0.9020686\tbest: 0.9023966 (565)\ttotal: 22.6s\tremaining: 5.61s\n",
"999:\ttest: 0.9010471\tbest: 0.9023966 (565)\ttotal: 28.3s\tremaining: 0us\n",
"\n",
"bestTest = 0.9023966387\n",
"bestIteration = 565\n",
"\n",
"Shrink model to first 566 iterations.\n",
"Learning rate set to 0.069883\n",
"0:\ttest: 0.5000000\tbest: 0.5000000 (0)\ttotal: 6.12ms\tremaining: 6.11s\n",
"200:\ttest: 0.9046922\tbest: 0.9051553 (194)\ttotal: 5.24s\tremaining: 20.8s\n",
"400:\ttest: 0.9034313\tbest: 0.9052665 (233)\ttotal: 10.9s\tremaining: 16.3s\n",
"600:\ttest: 0.9019832\tbest: 0.9052665 (233)\ttotal: 16.6s\tremaining: 11s\n",
"800:\ttest: 0.8998534\tbest: 0.9052665 (233)\ttotal: 22.6s\tremaining: 5.61s\n",
"999:\ttest: 0.8976830\tbest: 0.9052665 (233)\ttotal: 28.4s\tremaining: 0us\n",
"\n",
"bestTest = 0.9052665258\n",
"bestIteration = 233\n",
"\n",
"Shrink model to first 234 iterations.\n",
"Learning rate set to 0.069883\n",
"0:\ttest: 0.5000000\tbest: 0.5000000 (0)\ttotal: 5.75ms\tremaining: 5.75s\n",
"200:\ttest: 0.8932656\tbest: 0.8932656 (200)\ttotal: 5.13s\tremaining: 20.4s\n",
"400:\ttest: 0.8956892\tbest: 0.8957070 (398)\ttotal: 10.8s\tremaining: 16.2s\n",
"600:\ttest: 0.8978707\tbest: 0.8980333 (593)\ttotal: 16.3s\tremaining: 10.8s\n",
"800:\ttest: 0.8984955\tbest: 0.8986194 (758)\ttotal: 22s\tremaining: 5.47s\n",
"999:\ttest: 0.8985830\tbest: 0.8990488 (913)\ttotal: 27.8s\tremaining: 0us\n",
"\n",
"bestTest = 0.8990488015\n",
"bestIteration = 913\n",
"\n",
"Shrink model to first 914 iterations.\n",
"CV mean: 0.8947, CV std: 0.0132\n",
"CPU times: user 52min 54s, sys: 2min 39s, total: 55min 34s\n",
"Wall time: 1min 53s\n"
] } ], "source": [
"%%time\n",
"\n",
"from sklearn.model_selection import StratifiedKFold\n",
"\n",
"n_fold = 4 # amount of data folds\n",
"folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=SEED)\n",
"\n",
"params = {'loss_function':'Logloss',\n",
"          'eval_metric':'AUC',\n",
"          'verbose': 200,\n",
"          'random_seed': SEED\n",
"         }\n",
"\n",
"test_data = ctb.Pool(data=X_test,\n",
"                     cat_features=cat_features)\n",
"\n",
"# folds.split() yields positional row numbers. X is sliced with .iloc, so the\n",
"# labels are taken by position too: plain y[...] on a pandas Series is a label\n",
"# lookup and only agrees with .iloc while the index is the default 0..n-1.\n",
"y_values = np.asarray(y)\n",
"\n",
"scores = []                              # best validation AUC of each fold\n",
"prediction = np.zeros(X_test.shape[0])   # running sum of test probabilities\n",
"for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y_values)):\n",
"\n",
"    # Fold-local names on purpose: the SHAP cells above read X_train and\n",
"    # train_data, so this loop must not overwrite them.\n",
"    fold_train_pool = ctb.Pool(data=X.iloc[train_index],\n",
"                               label=y_values[train_index],\n",
"                               cat_features=cat_features)\n",
"    fold_valid_pool = ctb.Pool(data=X.iloc[valid_index],\n",
"                               label=y_values[valid_index],\n",
"                               cat_features=cat_features)\n",
"\n",
"    model = ctb.CatBoostClassifier(**params)\n",
"    model.fit(fold_train_pool,\n",
"              eval_set=fold_valid_pool,\n",
"              use_best_model=True)\n",
"\n",
"    scores.append(model.get_best_score()['validation']['AUC'])\n",
"\n",
"    # Column 1 of predict_proba is accumulated here and averaged after the loop.\n",
"    prediction += model.predict_proba(test_data)[:, 1]\n",
"\n",
"prediction /= n_fold\n",
"print('CV mean: {:.4f}, CV std: {:.4f}'.format(np.mean(scores), np.std(scores)))"
] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saving submission file as: catboost_submission.csv\n" ] } ], "source": [
"# NOTE(review): absolute local path, same directory the train/test CSVs are read from.\n",
"sub = pd.read_csv('/media/lvision/Sabrent/kaggle/2013/amazon-employee-access-challenge/sampleSubmission.csv')\n",
"\n",
"# One fold-averaged probability per submission row; stop with a clear message otherwise.\n",
"assert len(sub) == len(prediction), 'prediction length does not match sampleSubmission.csv'\n",
"sub['Action'] = prediction\n",
"\n",
"sub_name = 'catboost_submission.csv'\n",
"sub.to_csv(sub_name, index=False)\n",
"\n",
"print(f'Saving submission file as: {sub_name}')"
] } ], "metadata": { "interpreter": { "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 2 }