{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "laura_complete_experiments", "provenance": [], "collapsed_sections": [], "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "CeBA3sn_Snsh", "colab_type": "text" }, "source": [ "# References\n", " - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4736499/ (2016)\n", " - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4214112/ (2014)" ] }, { "cell_type": "markdown", "metadata": { "id": "LS-ZdnfuteJH", "colab_type": "text" }, "source": [ "## Load Libraries" ] }, { "cell_type": "code", "metadata": { "id": "qj0W32Yu0I1k", "colab_type": "code", "outputId": "f4870d56-499b-466e-d8e3-b27eb34a87eb", "colab": { "base_uri": "https://localhost:8080/", "height": 530 } }, "source": [ "!pip install catboost\n", "!pip install lightgbm\n", "!pip install missingpy" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Collecting catboost\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB)\n", "\u001b[K |████████████████████████████████| 64.4MB 47kB/s \n", "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from catboost) (1.12.0)\n", "Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (0.25.3)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from catboost) (1.4.1)\n", "Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (from catboost) (4.4.1)\n", "Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.17.5)\n", "Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from catboost) (0.10.1)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from catboost) (3.1.3)\n", "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2.6.1)\n", "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2018.9)\n", "Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly->catboost) (1.3.3)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (2.4.6)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (1.1.0)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (0.10.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib->catboost) (45.1.0)\n", "Installing collected packages: catboost\n", "Successfully installed catboost-0.22\n", "Requirement already satisfied: lightgbm in /usr/local/lib/python3.6/dist-packages (2.2.3)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from lightgbm) (1.4.1)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from lightgbm) (0.22.1)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from lightgbm) (1.17.5)\n", "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->lightgbm) (0.14.1)\n", "Collecting missingpy\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/b5/be/998d04d27054b58f0974b5f09f8457778a0a72d4355e0b7ae877b6cfb850/missingpy-0.2.0-py3-none-any.whl (49kB)\n", "\u001b[K |████████████████████████████████| 51kB 1.7MB/s \n", "\u001b[?25hInstalling collected packages: missingpy\n", "Successfully installed missingpy-0.2.0\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "4unBvMjYr6LB", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 71 }, "outputId": "f5b54e0c-cf3d-4195-aa81-941234e82bdf" }, "source": [ "import pandas as pd\n", "from sklearn.model_selection import KFold\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.dummy import DummyClassifier\n", "from catboost import CatBoostClassifier\n", "from missingpy import MissForest\n", "from xgboost import XGBClassifier\n", "import lightgbm as lgb\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score\n", "import numpy as np\n", "import time\n", "import warnings\n", "warnings.filterwarnings('ignore')" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.neighbors.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.\n", " warnings.warn(message, FutureWarning)\n" ], "name": "stderr" } ] }, { "cell_type": "markdown", "metadata": { "id": "p9h9XFIPtiUL", "colab_type": "text" }, "source": [ "# Load Data" ] }, { "cell_type": "code", "metadata": { "id": "rY7GPVLO_mQU", "colab_type": "code", "colab": {} }, "source": [ "!wget -c https://unkown/dataset_treinamento_modelo_generico_filled.csv.gz" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "AxzNSpOdtn__", "colab_type": "code", "outputId": "76f4b0b7-93c0-49b6-a041-02841f06239b", "colab": { "base_uri": "https://localhost:8080/", "height": 816 } }, "source": [ "dataset = pd.read_csv(\"dataset_treinamento_modelo_generico_filled.csv.gz\")\n", "dataset.drop('Unnamed: 0', axis=1, inplace=True)\n", "dataset.shape, dataset.columns" ], "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "((121522, 80), Index(['entity', 'atendimento_id', 'days_from_entrance', 'age',\n", " 'document.sexo', 'UTI', 'absolute_timestamp', 'collect_timestamp(t)',\n", " 'collect_timestamp(t-1)', 'collect_timestamp(t-2)',\n", " 'collect_timestamp(t-3)', 'collect_timestamp(t-4)',\n", " 'delta_collect_timestamp_t-t1', 'delta_collect_timestamp_t1-t2',\n", " 'delta_collect_timestamp_t2-t3', 'delta_collect_timestamp_t3-t4',\n", " 'document.freq_cardiaca(t)', 'document.freq_cardiaca(t-1)',\n", " 'document.freq_cardiaca(t-2)', 'document.freq_cardiaca(t-3)',\n", " 'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t)',\n", " 'document.freq_respiratoria(t-1)', 'document.freq_respiratoria(t-2)',\n", " 'document.freq_respiratoria(t-3)', 'document.freq_respiratoria(t-4)',\n", " 'document.glicemia_capilar(t)', 'document.glicemia_capilar(t-1)',\n", " 'document.glicemia_capilar(t-2)', 'document.glicemia_capilar(t-3)',\n", " 'document.glicemia_capilar(t-4)', 'document.pa_diastolica(t)',\n", " 'document.pa_diastolica(t-1)', 'document.pa_diastolica(t-2)',\n", " 'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)',\n", " 'document.pa_sistolica(t)', 'document.pa_sistolica(t-1)',\n", " 'document.pa_sistolica(t-2)', 'document.pa_sistolica(t-3)',\n", " 'document.pa_sistolica(t-4)', 'document.sat_o2(t)',\n", " 'document.sat_o2(t-1)', 'document.sat_o2(t-2)', 'document.sat_o2(t-3)',\n", " 'document.sat_o2(t-4)', 'document.temperatura(t)',\n", " 'document.temperatura(t-1)', 'document.temperatura(t-2)',\n", " 'document.temperatura(t-3)', 'document.temperatura(t-4)',\n", " 'delta_document.freq_cardiaca_t-t1',\n", " 'delta_document.freq_cardiaca_t1-t2',\n", " 'delta_document.freq_cardiaca_t2-t3',\n", " 'delta_document.freq_cardiaca_t3-t4',\n", " 'delta_document.freq_respiratoria_t-t1',\n", " 'delta_document.freq_respiratoria_t1-t2',\n", " 'delta_document.freq_respiratoria_t2-t3',\n", " 'delta_document.freq_respiratoria_t3-t4',\n", " 'delta_document.glicemia_capilar_t-t1',\n", " 'delta_document.glicemia_capilar_t1-t2',\n", " 'delta_document.glicemia_capilar_t2-t3',\n", " 'delta_document.glicemia_capilar_t3-t4',\n", " 'delta_document.pa_diastolica_t-t1',\n", " 'delta_document.pa_diastolica_t1-t2',\n", " 'delta_document.pa_diastolica_t2-t3',\n", " 'delta_document.pa_diastolica_t3-t4',\n", " 'delta_document.pa_sistolica_t-t1', 'delta_document.pa_sistolica_t1-t2',\n", " 'delta_document.pa_sistolica_t2-t3',\n", " 'delta_document.pa_sistolica_t3-t4', 'delta_document.sat_o2_t-t1',\n", " 'delta_document.sat_o2_t1-t2', 'delta_document.sat_o2_t2-t3',\n", " 'delta_document.sat_o2_t3-t4', 'delta_document.temperatura_t-t1',\n", " 'delta_document.temperatura_t1-t2', 'delta_document.temperatura_t2-t3',\n", " 'delta_document.temperatura_t3-t4', 'document.alta.motivo'],\n", " dtype='object'))" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "markdown", "metadata": { "id": "rWEX9Z8JzS_F", "colab_type": "text" }, "source": [ "## Cross Validation with Imputer\n", " - impute missing data\n", " - run experiments" ] }, { "cell_type": "code", "metadata": { "id": "Y9RWM6orzBma", "colab_type": "code", "colab": {} }, "source": [ "X = dataset.drop([\"document.alta.motivo\"], axis = 1)\n", "Y = dataset[\"document.alta.motivo\"]" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "U5f0R1bPaebk", "colab_type": "code", "colab": {} }, "source": [ "lightgbm_tunned = lgb.LGBMClassifier(boosting_type='gbdt', class_weight=None,\n", " colsample_bytree=0.9341899590668798, gamma=0.06731944764385,\n", " importance_type='split', learning_rate=0.11067874018709263,\n", " max_depth=14, min_child_samples=20,\n", " min_child_weight=14.46086218129473, min_split_gain=0.0,\n", " n_estimators=293, n_jobs=-1, num_leaves=31, objective=None,\n", " random_state=None, reg_alpha=11.767083608890678, reg_lambda=0.0,\n", " silent=True, subsample=0.8930069556095456,\n", " subsample_for_bin=200000, subsample_freq=0)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "_HeOe18a1zLo", "colab_type": "code", "colab": {} }, "source": [ "kfold = KFold(n_splits=10, random_state=7, shuffle=True)\n", "\n", "classifiers = {\n", " 'XGBoost' : XGBClassifier(learning_rate=0.1, n_estimators=100,random_state=7, tree_method='gpu_hist'),\n", " 'LogReg': LogisticRegression(solver='liblinear', multi_class='ovr'),\n", " 'D.Tree': DecisionTreeClassifier(),\n", " 'RForest': RandomForestClassifier(n_estimators = 50),\n", " 'CatBoos': CatBoostClassifier(learning_rate=0.1,n_estimators=100,random_state=7,task_type='GPU',verbose = False),\n", " 'Naive': GaussianNB(),\n", " 'Light': lightgbm_tunned\n", "}" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "AdsKY9LW2T62", "colab_type": "code", "outputId": "3966a5a4-e1dd-477a-8f6b-2f9dd6716e09", "colab": { "base_uri": "https://localhost:8080/", "height": 153 } }, "source": [ "print('Entity ID: all, Count', len(dataset))\n", "\n", "cols = ['delta_document.pa_diastolica_t2-t3', 'delta_document.pa_sistolica_t3-t4', 'document.freq_cardiaca(t-4)', 'document.glicemia_capilar(t-1)', 'document.freq_respiratoria(t-1)', 'delta_document.pa_sistolica_t-t1', 'document.sat_o2(t-4)', 'document.pa_sistolica(t)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'delta_document.pa_diastolica_t1-t2', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'delta_document.sat_o2_t2-t3', 'document.freq_cardiaca(t)', 'document.pa_sistolica(t-1)', 'document.pa_diastolica(t)', 'UTI', 'age', 'document.sexo', 'document.freq_cardiaca(t-1)', 'delta_document.temperatura_t-t1', 'document.pa_diastolica(t-2)', 'delta_document.temperatura_t2-t3', 'document.sat_o2(t-2)', 'days_from_entrance', 'delta_document.glicemia_capilar_t-t1', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-3)', 'document.freq_respiratoria(t)', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.freq_respiratoria_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_sistolica_t2-t3', 'delta_document.freq_cardiaca_t-t1', 'document.temperatura(t-4)', 'document.temperatura(t)', 'delta_document.freq_cardiaca_t2-t3', 'document.temperatura(t-1)', 'document.pa_diastolica(t-3)', 'delta_document.freq_cardiaca_t1-t2', 'document.glicemia_capilar(t-3)', 'delta_document.pa_diastolica_t-t1', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-1)', 'document.glicemia_capilar(t)', 'delta_document.sat_o2_t1-t2', 'document.pa_sistolica(t-2)', 'delta_document.freq_respiratoria_t-t1', 'delta_document.sat_o2_t-t1', 'document.pa_diastolica(t-4)', 'delta_document.freq_respiratoria_t2-t3', 'document.sat_o2(t)', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_sistolica_t1-t2', 'delta_document.temperatura_t1-t2', 'document.glicemia_capilar(t-2)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4', 'document.temperatura(t-2)', 'document.sat_o2(t-1)', 'delta_document.glicemia_capilar_t2-t3', 'delta_document.freq_respiratoria_t1-t2']\n", "X = dataset[cols]\n", "Y = dataset[\"document.alta.motivo\"]\n", "\n", "for c in classifiers:\n", " start = time.time()\n", " model = classifiers[c]\n", " scores = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')\n", " scores_f1 = cross_val_score(model, X, Y, cv=kfold, scoring='f1')\n", " print (c + '\\t', round(scores.mean(),4), '(' + str(round(scores_f1.mean(),4)) + ')', round(time.time() - start,2), 's')" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Entity ID: all, Count 121522\n", "XGBoost\t 0.9559 (0.632) 10.02 s\n", "LogReg\t 0.9326 (0.5565) 967.61 s\n", "D.Tree\t 0.7482 (0.4994) 156.02 s\n", "RForest\t 0.9406 (0.609) 546.14 s\n", "CatBoos\t 0.9555 (0.6426) 31.79 s\n", "Naive\t 0.8418 (0.3793) 3.45 s\n", "Light\t 0.9611 (0.6715) 312.0 s\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "9O1qKzHkTi8N", "colab_type": "text" }, "source": [ "# Cross Validation by Entity ID" ] }, { "cell_type": "code", "metadata": { "id": "qi0aIJOTTn_6", "colab_type": "code", "outputId": "601f0bd6-b9ce-4b9e-c81b-9ce5834a633f", "colab": { "base_uri": "https://localhost:8080/", "height": 833 } }, "source": [ "for id in dataset['entity'].unique():\n", " dataset_entity = dataset[dataset['entity']==id]\n", " print('Entity ID:', id, ', Count:',len(dataset_entity))\n", "\n", " X_ID = dataset_entity.drop([\"document.alta.motivo\"], axis = 1)\n", " Y_ID = dataset_entity[\"document.alta.motivo\"]\n", "\n", " for c in classifiers:\n", " start = time.time()\n", " model = classifiers[c]\n", " scores = cross_val_score(model, X_ID, Y_ID, cv=kfold, scoring='roc_auc')\n", " print ('\\t' + c + '\\t', round(scores.mean(),4), '(+-' + str(round(scores.std(),4)) + ')', round(time.time() - start,2), 's')" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "Entity ID: 1.0 , Count: 16674\n", "\tXGBoost\t 0.9776 (+-0.0107) 21.4 s\n", "\tLogReg\t 0.7975 (+-0.077) 5.53 s\n", "\tD.Tree\t 0.804 (+-0.0326) 10.39 s\n", "\tRForest\t 0.9532 (+-0.0172) 33.14 s\n", "\tCatBoos\t 0.9764 (+-0.013) 19.49 s\n", "\tNaive\t 0.8898 (+-0.0281) 0.25 s\n", "\tLight\t 0.9772 (+-0.0103) 12.59 s\n", "Entity ID: 3.0 , Count: 28279\n", "\tXGBoost\t 0.9587 (+-0.0068) 30.42 s\n", "\tLogReg\t 0.7345 (+-0.0986) 12.36 s\n", "\tD.Tree\t 0.7195 (+-0.0123) 20.82 s\n", "\tRForest\t 0.9346 (+-0.011) 59.67 s\n", "\tCatBoos\t 0.9548 (+-0.0061) 20.61 s\n", "\tNaive\t 0.8356 (+-0.0271) 0.43 s\n", "\tLight\t 0.9616 (+-0.0055) 27.89 s\n", "Entity ID: 5.0 , Count: 17091\n", "\tXGBoost\t 0.9156 (+-0.0061) 20.89 s\n", "\tLogReg\t 0.5858 (+-0.0222) 3.42 s\n", "\tD.Tree\t 0.7038 (+-0.0185) 12.17 s\n", "\tRForest\t 0.8955 (+-0.0074) 35.9 s\n", "\tCatBoos\t 0.9153 (+-0.0068) 21.0 s\n", "\tNaive\t 0.8151 (+-0.0097) 0.27 s\n", "\tLight\t 0.9179 (+-0.0078) 20.22 s\n", "Entity ID: 8.0 , Count: 19811\n", "\tXGBoost\t 0.9598 (+-0.0158) 21.0 s\n", "\tLogReg\t 0.717 (+-0.0359) 4.78 s\n", "\tD.Tree\t 0.8087 (+-0.0238) 14.32 s\n", "\tRForest\t 0.9491 (+-0.0156) 43.85 s\n", "\tCatBoos\t 0.9567 (+-0.016) 20.64 s\n", "\tNaive\t 0.8503 (+-0.0231) 0.31 s\n", "\tLight\t 0.9613 (+-0.014) 15.71 s\n", "Entity ID: 10.0 , Count: 6524\n", "\tXGBoost\t 0.9779 (+-0.0135) 10.7 s\n", "\tLogReg\t 0.7244 (+-0.0769) 2.81 s\n", "\tD.Tree\t 0.8429 (+-0.0395) 3.08 s\n", "\tRForest\t 0.9641 (+-0.0204) 10.12 s\n", "\tCatBoos\t 0.9794 (+-0.0132) 18.46 s\n", "\tNaive\t 0.8133 (+-0.0352) 0.13 s\n", "\tLight\t 0.9817 (+-0.0091) 5.07 s\n", "Entity ID: 38.0 , Count: 33143\n", "\tXGBoost\t 0.9643 (+-0.0063) 30.77 s\n", "\tLogReg\t 0.5936 (+-0.0282) 4.75 s\n", "\tD.Tree\t 0.7528 (+-0.0139) 30.18 s\n", "\tRForest\t 0.9483 (+-0.0084) 83.26 s\n", "\tCatBoos\t 0.9642 (+-0.0053) 20.45 s\n", "\tNaive\t 0.7082 (+-0.0269) 0.5 s\n", "\tLight\t 0.9668 (+-0.0064) 32.9 s\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "p90RFQ0fk5q1", "colab_type": "text" }, "source": [ "# Cross Validation by Windowing" ] }, { "cell_type": "code", "metadata": { "id": "m6MBaIn9p1yn", "colab_type": "code", "outputId": "b913bc16-cb89-4e63-a95c-06514ea08b69", "colab": { "base_uri": "https://localhost:8080/", "height": 802 } }, "source": [ "cols = ['age', 'document.sexo', 'UTI', 'days_from_entrance']\n", "t_cols = [c for c in dataset.columns if '4)' in c and (not 'time' in c)]\n", "\n", "for i in [4,3,2,1,0]:\n", " \n", " if i == 4: cols.extend(t_cols)\n", " if i == 0:\n", " tN_cols = [c for c in dataset.columns if ('t)' in c or '_t-' in c) and (not 'time' in c)]\n", " cols.extend(tN_cols)\n", " else: \n", " tN_cols = [c for c in dataset.columns if ('t-'+str(i) in c or '_t'+str(i) in c) and (not 'time' in c)]\n", " cols.extend(tN_cols)\n", "\n", " cols = list(set(cols))\n", " print('Número de Colunas:', len(cols), 'Exame(s):', 5-i)\n", " print(cols)\n", "\n", " X_N = dataset[cols]\n", " #X_N = imputer.fit_transform(X_N)\n", " Y_N = dataset[\"document.alta.motivo\"]\n", "\n", " for c in classifiers:\n", " start = time.time()\n", " model = classifiers[c]\n", " scores = cross_val_score(model, X_N, Y_N, cv=kfold, scoring='roc_auc')\n", " print ('\\t' + c + '\\t', round(scores.mean(),4), '(+-' + str(round(scores.std(),4)) + ')', round(time.time() - start,2), 's')" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Número de Colunas: 11 Exame(s): 1\n", "['document.glicemia_capilar(t-4)', 'document.pa_diastolica(t-4)', 'UTI', 'age', 'document.sexo', 'document.temperatura(t-4)', 'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t-4)', 'document.pa_sistolica(t-4)', 'days_from_entrance', 'document.sat_o2(t-4)']\n", "\tXGBoost\t 0.9285 (+-0.0047) 2.96 s\n", "\tLogReg\t 0.9047 (+-0.0054) 11.25 s\n", "\tD.Tree\t 0.7107 (+-0.0074) 7.48 s\n", "\tRForest\t 0.9064 (+-0.0046) 72.15 s\n", "\tCatBoos\t 0.9301 (+-0.0045) 9.83 s\n", "\tNaive\t 0.8578 (+-0.006) 0.44 s\n", "\tLight\t 0.9354 (+-0.0044) 44.18 s\n", "Número de Colunas: 25 Exame(s): 2\n", "['delta_document.pa_sistolica_t3-t4', 'document.freq_cardiaca(t-4)', 'document.sat_o2(t-4)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'UTI', 'age', 'document.sexo', 'days_from_entrance', 'document.freq_cardiaca(t-3)', 'delta_document.pa_diastolica_t3-t4', 'delta_document.sat_o2_t3-t4', 'delta_document.freq_respiratoria_t3-t4', 'document.temperatura(t-4)', 'document.pa_diastolica(t-3)', 'document.glicemia_capilar(t-3)', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4']\n", "\tXGBoost\t 0.9374 (+-0.005) 4.18 s\n", "\tLogReg\t 0.9138 (+-0.0064) 43.98 s\n", "\tD.Tree\t 0.7192 (+-0.0116) 20.04 s\n", "\tRForest\t 0.9141 (+-0.0085) 130.24 s\n", "\tCatBoos\t 0.9379 (+-0.0049) 13.25 s\n", "\tNaive\t 0.8337 (+-0.0067) 0.67 s\n", "\tLight\t 0.943 (+-0.0042) 71.26 s\n", "Número de Colunas: 39 Exame(s): 3\n", "['delta_document.pa_sistolica_t3-t4', 'delta_document.pa_diastolica_t2-t3', 'document.freq_cardiaca(t-4)', 'document.sat_o2(t-4)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'delta_document.sat_o2_t2-t3', 'UTI', 'age', 'document.sexo', 'document.pa_diastolica(t-2)', 'delta_document.temperatura_t2-t3', 'document.sat_o2(t-2)', 'days_from_entrance', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-3)', 'delta_document.freq_respiratoria_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_sistolica_t2-t3', 'document.temperatura(t-4)', 'delta_document.freq_cardiaca_t2-t3', 'document.pa_diastolica(t-3)', 'document.glicemia_capilar(t-3)', 'document.pa_sistolica(t-4)', 'document.pa_sistolica(t-2)', 'document.pa_diastolica(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'document.glicemia_capilar(t-2)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4', 'document.temperatura(t-2)', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.glicemia_capilar_t2-t3']\n", "\tXGBoost\t 0.944 (+-0.0045) 4.35 s\n", "\tLogReg\t 0.9201 (+-0.0058) 122.47 s\n", "\tD.Tree\t 0.7272 (+-0.0084) 35.78 s\n", "\tRForest\t 0.9237 (+-0.0074) 174.16 s\n", "\tCatBoos\t 0.9445 (+-0.0046) 14.22 s\n", "\tNaive\t 0.8316 (+-0.0074) 1.08 s\n", "\tLight\t 0.9495 (+-0.0041) 100.16 s\n", "Número de Colunas: 53 Exame(s): 4\n", "['delta_document.pa_diastolica_t2-t3', 'delta_document.pa_sistolica_t3-t4', 'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t-1)', 'document.glicemia_capilar(t-1)', 'document.sat_o2(t-4)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'delta_document.pa_diastolica_t1-t2', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'delta_document.sat_o2_t2-t3', 'document.pa_sistolica(t-1)', 'UTI', 'age', 'document.sexo', 'document.freq_cardiaca(t-1)', 'document.pa_diastolica(t-2)', 'delta_document.temperatura_t2-t3', 'document.sat_o2(t-2)', 'days_from_entrance', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-3)', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.freq_respiratoria_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_sistolica_t2-t3', 'document.sat_o2(t-1)', 'document.temperatura(t-4)', 'delta_document.freq_cardiaca_t2-t3', 'document.temperatura(t-1)', 'document.pa_diastolica(t-3)', 'delta_document.freq_cardiaca_t1-t2', 'document.glicemia_capilar(t-3)', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-1)', 'delta_document.sat_o2_t1-t2', 'document.pa_sistolica(t-2)', 'document.pa_diastolica(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_sistolica_t1-t2', 'delta_document.temperatura_t1-t2', 'document.glicemia_capilar(t-2)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4', 'document.temperatura(t-2)', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.glicemia_capilar_t2-t3', 'delta_document.freq_respiratoria_t1-t2']\n", "\tXGBoost\t 0.9506 (+-0.0043) 5.3 s\n", "\tLogReg\t 0.9286 (+-0.005) 236.43 s\n", "\tD.Tree\t 0.7362 (+-0.0049) 58.34 s\n", "\tRForest\t 0.9327 (+-0.0055) 221.28 s\n", "\tCatBoos\t 0.9502 (+-0.0041) 15.24 s\n", "\tNaive\t 0.8366 (+-0.007) 1.51 s\n", "\tLight\t 0.9559 (+-0.0038) 128.46 s\n", "Número de Colunas: 67 Exame(s): 5\n", "['delta_document.pa_diastolica_t2-t3', 'delta_document.pa_sistolica_t3-t4', 'document.freq_cardiaca(t-4)', 'document.glicemia_capilar(t-1)', 'document.freq_respiratoria(t-1)', 'delta_document.pa_sistolica_t-t1', 'document.sat_o2(t-4)', 'document.pa_sistolica(t)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'delta_document.pa_diastolica_t1-t2', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'delta_document.sat_o2_t2-t3', 'document.freq_cardiaca(t)', 'document.pa_sistolica(t-1)', 'document.pa_diastolica(t)', 'UTI', 'age', 'document.sexo', 'document.freq_cardiaca(t-1)', 'delta_document.temperatura_t-t1', 'document.pa_diastolica(t-2)', 'delta_document.temperatura_t2-t3', 'document.sat_o2(t-2)', 'days_from_entrance', 'delta_document.glicemia_capilar_t-t1', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-3)', 'document.freq_respiratoria(t)', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.freq_respiratoria_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_sistolica_t2-t3', 'delta_document.freq_cardiaca_t-t1', 'document.temperatura(t-4)', 'document.temperatura(t)', 'delta_document.freq_cardiaca_t2-t3', 'document.temperatura(t-1)', 'document.pa_diastolica(t-3)', 'delta_document.freq_cardiaca_t1-t2', 'document.glicemia_capilar(t-3)', 'delta_document.pa_diastolica_t-t1', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-1)', 'document.glicemia_capilar(t)', 'delta_document.sat_o2_t1-t2', 'document.pa_sistolica(t-2)', 'delta_document.freq_respiratoria_t-t1', 'delta_document.sat_o2_t-t1', 'document.pa_diastolica(t-4)', 'delta_document.freq_respiratoria_t2-t3', 'document.sat_o2(t)', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_sistolica_t1-t2', 'delta_document.temperatura_t1-t2', 'document.glicemia_capilar(t-2)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4', 'document.temperatura(t-2)', 'document.sat_o2(t-1)', 'delta_document.glicemia_capilar_t2-t3', 'delta_document.freq_respiratoria_t1-t2']\n", "\tXGBoost\t 0.9559 (+-0.0039) 5.1 s\n", "\tLogReg\t 0.9326 (+-0.0055) 493.28 s\n", "\tD.Tree\t 0.7465 (+-0.0076) 78.28 s\n", "\tRForest\t 0.9402 (+-0.006) 271.72 s\n", "\tCatBoos\t 0.9555 (+-0.0041) 15.67 s\n", "\tNaive\t 0.8418 (+-0.0068) 1.76 s\n", "\tLight\t 0.9611 (+-0.0039) 156.87 s\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "_DWvqPiFMYLA", "colab_type": "text" }, "source": [ "## MEWS Score" ] }, { "cell_type": "code", "metadata": { "id": "EOTgQcV5HUGZ", "colab_type": "code", "colab": {} }, "source": [ "## based on: https://www.mdcalc.com/modified-early-warning-score-mews-clinical-deterioration\n", "## Revisar com um Clínico\n", "def mews_score(x, t):\n", " systolic_bp = x['document.pa_sistolica(t'+t+')']\n", " heart_rate = x['document.freq_cardiaca(t'+t+')']\n", " respiratory_rate = x['document.freq_respiratoria(t'+t+')']\n", " temperature = x['document.temperatura(t'+t+')']\n", " avpu_score = 0 # not informed\n", "\n", " mews = 0\n", "\n", " if systolic_bp <= 70: mews += 3\n", " elif systolic_bp <= 80: mews += 2\n", " elif systolic_bp <= 100: mews += 1\n", " elif systolic_bp < 200: mews += 0\n", " else: mews += 2\n", "\n", " if heart_rate <= 40: mews += 2\n", " elif heart_rate <= 50: mews += 1\n", " elif heart_rate <= 100: mews += 0\n", " elif heart_rate <= 110: mews += 1\n", " elif heart_rate < 130: mews += 2\n", " else: mews += 3\n", "\n", " if respiratory_rate < 9: mews += 2\n", " elif respiratory_rate <= 14: mews += 0\n", " elif respiratory_rate <= 20: mews += 1\n", " elif respiratory_rate < 30: mews += 2\n", " else: mews += 3\n", "\n", " if temperature < 35: mews += 2\n", " elif temperature <= 38.4: mews += 0\n", " else: mews += 2\n", "\n", " return mews" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ap1KrRypMewl", "colab_type": "code", "colab": {} }, "source": [ "columns = dataset.columns.drop([\"document.alta.motivo\"])\n", "pd_mews = pd.DataFrame()\n", "pd_mews['t4'] = pd.DataFrame(X, columns=columns).apply(mews_score, axis=1, args=('-4',))\n", "pd_mews['t3'] = pd.DataFrame(X, columns=columns).apply(mews_score, axis=1, args=('-3',))\n", "pd_mews['t2'] = pd.DataFrame(X, columns=columns).apply(mews_score, axis=1, args=('-2',))\n", "pd_mews['t1'] = pd.DataFrame(X, columns=columns).apply(mews_score, axis=1, args=('-1',))\n", "pd_mews['t'] = pd.DataFrame(X, columns=columns).apply(mews_score, axis=1, args=('',))" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "odd021P1L52f", "colab_type": "code", "outputId": "ed11498e-a990-48d8-a51b-6648e86521ca", "colab": { "base_uri": "https://localhost:8080/", "height": 442 } }, "source": [ "for t in ['t4','t3','t2','t1','t']:\n", " print('----',t,'-----')\n", " for i in range(4): \n", " mews_binary = (pd_mews[t] > i) * 1\n", " print(i, round(roc_auc_score(Y, mews_binary),4), round(f1_score(Y, mews_binary),4))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "---- t4 -----\n", "0 0.5042 0.0965\n", "1 0.6584 0.1634\n", "2 0.626 0.1814\n", "3 0.581 0.1659\n", "---- t3 -----\n", "0 0.5045 0.0966\n", "1 0.6738 0.1705\n", "2 0.6405 0.1991\n", "3 0.5904 0.1799\n", "---- t2 -----\n", "0 0.5029 0.0963\n", "1 0.6768 0.17\n", "2 0.6513 0.2053\n", "3 0.5958 0.182\n", "---- t1 -----\n", "0 0.5024 0.0962\n", "1 0.6891 0.1742\n", "2 0.6718 0.2258\n", "3 0.6104 0.2025\n", "---- t -----\n", "0 0.5017 0.0961\n", "1 0.6976 0.1755\n", "2 0.6829 0.2321\n", "3 0.6201 0.2075\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "JQKh8J3MNxqW", "colab_type": "code", "colab": {} }, "source": [ "confusion_matrix(Y, mews_binary)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "IE5oGJnQ_WRn", "colab_type": "text" }, "source": [ "# NEWS2 Score" ] }, { "cell_type": "code", "metadata": { "id": "3JVte1ig9mHe", "colab_type": "code", "colab": {} }, "source": [ "## based on: https://www.rcplondon.ac.uk/projects/outputs/national-early-warning-score-news-2\n", "## Revisar com um Clínico\n", "def news2_score(x, t):\n", " systolic_bp = x['document.pa_sistolica(t'+t+')']\n", " heart_rate = x['document.freq_cardiaca(t'+t+')']\n", " respiratory_rate = x['document.freq_respiratoria(t'+t+')']\n", " temperature = x['document.temperatura(t'+t+')']\n", " avpu_score = 0 # not informed\n", "\n", " news2 = 0\n", "\n", " if systolic_bp <= 90: news2 += 3\n", " elif systolic_bp <= 100: news2 += 2\n", " elif systolic_bp <= 110: news2 += 1\n", " elif systolic_bp < 220: news2 += 0\n", " else: news2 += 3\n", "\n", " if heart_rate <= 40: news2 += 3\n", " elif heart_rate <= 50: news2 += 1\n", " elif heart_rate <= 90: news2 += 0\n", " elif heart_rate <= 110: news2 += 1\n", " elif heart_rate < 130: news2 += 2\n", " else: news2 += 3\n", "\n", " if respiratory_rate <= 8: news2 += 3\n", " elif respiratory_rate <= 11: news2 += 1\n", " elif respiratory_rate <= 20: news2 += 0\n", " elif respiratory_rate <= 24: news2 += 2\n", " else: news2 += 3\n", "\n", " if temperature <= 35: news2 += 3\n", " elif temperature <= 36: news2 += 1\n", " elif temperature <= 38: news2 += 0\n", " elif temperature <= 39: news2 += 1\n", " else: news2 += 2\n", "\n", " return news2" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dqqTPxN__aFF", "colab_type": "code", "colab": {} }, "source": [ "columns = dataset.columns.drop([\"document.alta.motivo\"])\n", "pd_news2 = pd.DataFrame()\n", "pd_news2['t4'] = pd.DataFrame(X, columns=columns).apply(news2_score, axis=1, args=('-4',))\n", "pd_news2['t3'] = pd.DataFrame(X, columns=columns).apply(news2_score, axis=1, args=('-3',))\n", "pd_news2['t2'] = pd.DataFrame(X, columns=columns).apply(news2_score, axis=1, args=('-2',))\n", "pd_news2['t1'] = pd.DataFrame(X, columns=columns).apply(news2_score, axis=1, args=('-1',))\n", "pd_news2['t'] = pd.DataFrame(X, columns=columns).apply(news2_score, axis=1, args=('',))" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bMWrrLIc_cXi", "colab_type": "code", "outputId": "64bcdf63-c8e0-42c9-fdbe-a83e68599eb6", "colab": { "base_uri": "https://localhost:8080/", "height": 612 } }, "source": [ "for t in ['t4','t3','t2','t1','t']:\n", " print('----',t,'-----')\n", " for i in range(6): \n", " news2_binary = (pd_news2[t] > i) * 1\n", " print(i, round(roc_auc_score(Y, news2_binary),4), round(f1_score(Y, news2_binary),4))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "---- t4 -----\n", "0 0.5627 0.1086\n", "1 0.6518 0.1467\n", "2 0.6452 0.1667\n", "3 0.6216 0.179\n", "4 0.5836 0.1634\n", "5 0.5506 0.1348\n", "---- t3 -----\n", "0 0.5716 0.1112\n", "1 0.6429 0.1449\n", "2 0.6512 0.1728\n", "3 0.6242 0.1831\n", "4 0.5804 0.1591\n", "5 0.551 0.1339\n", "---- t2 -----\n", "0 0.5691 0.1106\n", "1 0.6501 0.1475\n", "2 0.6595 0.1752\n", "3 0.6269 0.1823\n", "4 0.5851 0.163\n", "5 0.5521 0.1338\n", "---- t1 -----\n", "0 0.5785 0.1128\n", "1 0.666 0.1523\n", "2 0.6783 0.1853\n", "3 0.6501 0.2049\n", "4 0.604 0.1889\n", "5 0.5657 0.1582\n", "---- t -----\n", "0 0.5846 0.1139\n", "1 0.6822 0.1565\n", "2 0.7048 0.1964\n", "3 0.6773 0.2231\n", "4 0.6232 0.2037\n", "5 0.5754 0.1673\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "MuoTQyR-Kz80", "colab_type": "text" }, "source": [ "# Average Time Interval" ] }, { "cell_type": "code", "metadata": { "id": "gudVKbBbPi-R", "colab_type": "code", "outputId": "b9da3363-6d08-41dd-8610-9946c47a04bc", "colab": { "base_uri": "https://localhost:8080/", "height": 102 } }, "source": [ "delta_t_cols = [c for c in dataset.columns if 'delta_collect' in c]\n", "dataset[delta_t_cols].mean() / 3600" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "delta_collect_timestamp_t-t1 4.235348\n", "delta_collect_timestamp_t1-t2 3.413571\n", "delta_collect_timestamp_t2-t3 3.116624\n", "delta_collect_timestamp_t3-t4 3.233622\n", "dtype: float64" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] } ] }