## Import Data and Libraries

In [1]:
!git clone https://github.com/laura-health/cbms2020/
!pip install catboost
!pip install lightgbm
!pip install missingpy

Cloning into 'cbms2020'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 9 (delta 2), reused 4 (delta 0), pack-reused 0[K
Unpacking objects: 100% (9/9), done.
Collecting catboost
[?25l Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB)
[K |████████████████████████████████| 64.4MB 41kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.22
Collecting missingpy
[?25l Downloading https://files.pythonhosted.org/packages/b5/be/998d04d27054b58f0974b5f09f8457778a0a72d4355e0b7ae877b6cfb850/missingpy-0.2.0-py3-none-any.whl (49kB)
[K |████████████████████████████████| 51kB 4.0MB/s 
[?25hInstalling collected packages: missingpy
Successfully installed missingpy-0.2.0


## Load Data and Libraries

In [0]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from missingpy import MissForest
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

In [20]:
dataset = pd.read_csv("cbms2020/heg_sample_data.csv") #normalized dataset
dataset.drop('Unnamed: 0', axis=1, inplace=True)
dataset.shape, dataset.columns

((13652, 72), Index(['days_from_entrance', 'age', 'document.sexo', 'UTI',
 'delta_collect_timestamp_t-t1', 'delta_collect_timestamp_t1-t2',
 'delta_collect_timestamp_t2-t3', 'delta_collect_timestamp_t3-t4',
 'document.freq_cardiaca(t)', 'document.freq_cardiaca(t-1)',
 'document.freq_cardiaca(t-2)', 'document.freq_cardiaca(t-3)',
 'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t)',
 'document.freq_respiratoria(t-1)', 'document.freq_respiratoria(t-2)',
 'document.freq_respiratoria(t-3)', 'document.freq_respiratoria(t-4)',
 'document.glicemia_capilar(t)', 'document.glicemia_capilar(t-1)',
 'document.glicemia_capilar(t-2)', 'document.glicemia_capilar(t-3)',
 'document.glicemia_capilar(t-4)', 'document.pa_diastolica(t)',
 'document.pa_diastolica(t-1)', 'document.pa_diastolica(t-2)',
 'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)',
 'document.pa_sistolica(t)', 'document.pa_sistolica(t-1)',
 'document.pa_sistolica(t-2)', 'document.pa_sistolica(t-3)',
 'document.pa_si

## Setup Expetiments

In [0]:
X = dataset.drop(["outcome"], axis = 1)
Y = dataset["outcome"]

In [0]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

classifiers = {
 'XGBoost' : XGBClassifier(learning_rate=0.1, n_estimators=100,random_state=7, tree_method='gpu_hist'),
 'LogReg': LogisticRegression(solver='liblinear', multi_class='ovr'),
 'D.Tree': DecisionTreeClassifier(),
 'RForest': RandomForestClassifier(n_estimators = 50),
 'CatBoos': CatBoostClassifier(learning_rate=0.1,n_estimators=100,random_state=7,task_type='GPU',verbose = False),
 'Naive': GaussianNB(),
 'Light': lgb.LGBMClassifier()
}

## Run Basic Experiments

In [8]:
for c in classifiers:
 start = time.time()
 model = classifiers[c]
 scores = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
 scores_f1 = cross_val_score(model, X, Y, cv=kfold, scoring='f1')
 print (c + '\t', round(scores.mean(),4), '(' + str(round(scores_f1.mean(),4)) + ')', round(time.time() - start,2), 's')

XGBoost	 0.9076 (0.563) 6.44 s
LogReg	 0.8783 (0.5129) 11.1 s
D.Tree	 0.6973 (0.4705) 16.1 s
RForest	 0.8853 (0.5185) 53.33 s
CatBoos	 0.9058 (0.567) 84.57 s
Naive	 0.7854 (0.456) 0.58 s
Light	 0.9054 (0.5877) 27.89 s


## Cross Validation by Windowing

In [10]:
cols = ['age', 'document.sexo', 'UTI', 'days_from_entrance']
t_cols = [c for c in dataset.columns if '4)' in c and (not 'time' in c)]

for i in [4,3,2,1,0]:
 
 if i == 4: cols.extend(t_cols)
 if i == 0:
 tN_cols = [c for c in dataset.columns if ('t)' in c or '_t-' in c) and (not 'time' in c)]
 cols.extend(tN_cols)
 else: 
 tN_cols = [c for c in dataset.columns if ('t-'+str(i) in c or '_t'+str(i) in c) and (not 'time' in c)]
 cols.extend(tN_cols)

 cols = list(set(cols))
 print('Number of Columns:', len(cols), 'Exam(s):', 5-i)
 print(cols)

 X_W = dataset[cols]
 Y_W = dataset["outcome"]

 for c in classifiers:
 start = time.time()
 model = classifiers[c]
 scores = cross_val_score(model, X_W, Y_W, cv=kfold, scoring='roc_auc')
 print ('\t' + c + '\t', round(scores.mean(),4), '(+-' + str(round(scores.std(),4)) + ')', round(time.time() - start,2), 's')

Number of Columns: 11 Exam(s): 1
['document.temperatura(t-4)', 'document.glicemia_capilar(t-4)', 'days_from_entrance', 'UTI', 'document.pa_diastolica(t-4)', 'document.sat_o2(t-4)', 'document.freq_respiratoria(t-4)', 'document.sexo', 'document.freq_cardiaca(t-4)', 'document.pa_sistolica(t-4)', 'age']
	XGBoost	 0.846 (+-0.0073) 2.25 s
	LogReg	 0.8045 (+-0.015) 0.54 s
	D.Tree	 0.6356 (+-0.0154) 0.76 s
	RForest	 0.8079 (+-0.0139) 7.39 s
	CatBoos	 0.8446 (+-0.01) 9.36 s
	Naive	 0.7696 (+-0.0138) 0.12 s
	Light	 0.8408 (+-0.0085) 2.75 s
Number of Columns: 25 Exam(s): 2
['document.temperatura(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'UTI', 'document.freq_respiratoria(t-3)', 'delta_document.temperatura_t3-t4', 'age', 'document.sat_o2(t-4)', 'delta_document.freq_respiratoria_t3-t4', 'document.temperatura(t-3)', 'document.glicemia_capilar(t-3)', 'document.freq_cardiaca(t-4)', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.glicemia_capilar(t-4)', 'delta_docum