{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Sliding window fold\n", "- K-Fold 말고, 기간이 있으면 누적해서 Train을 설정하는 Fold (뒤로 갈수록 Train 구간이 늘어나는 expanding window 방식)\n", "- 주로 시계열 데이터를 다룰 때 사용\n", "- [참고 문서](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html)\n", "" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import TimeSeriesSplit\n", "from sklearn.preprocessing import LabelEncoder\n", "import xgboost as xgb" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "date_index = pd.date_range(start='2019-01-01', end='2019-01-30', freq='1D')\n", "X = pd.DataFrame(date_index, columns=['date'])\n", "X['dummy'] = 'a'\n", "X['label'] = 1" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "y = X['label']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "del X['label']" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datedummy
02019-01-01a
12019-01-02a
22019-01-03a
32019-01-04a
42019-01-05a
\n", "
" ], "text/plain": [ " date dummy\n", "0 2019-01-01 a\n", "1 2019-01-02 a\n", "2 2019-01-03 a\n", "3 2019-01-04 a\n", "4 2019-01-05 a" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head()" ] },
{ "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRAIN: [0 1 2 3 4] TEST: [5 6 7 8 9]\n", "X_train\n", " date dummy date_encoding\n", "0 0 0 0\n", "1 1 0 1\n", "2 2 0 2\n", "\n", "X_test\n", " date dummy date_encoding\n", "5 5 0 5\n", "6 6 0 6\n", "7 7 0 7\n", "TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11 12 13 14]\n", "X_train\n", " date dummy date_encoding\n", "0 0 0 0\n", "1 1 0 1\n", "2 2 0 2\n", "\n", "X_test\n", " date dummy date_encoding\n", "10 10 0 10\n", "11 11 0 11\n", "12 12 0 12\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] TEST: [15 16 17 18 19]\n", "X_train\n", " date dummy date_encoding\n", "0 0 0 0\n", "1 1 0 1\n", "2 2 0 2\n", "\n", "X_test\n", " date dummy date_encoding\n", "15 15 0 15\n", "16 16 0 16\n", "17 17 0 17\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21 22 23 24]\n", "X_train\n", " date dummy date_encoding\n", "0 0 0 0\n", "1 1 0 1\n", "2 2 0 2\n", "\n", "X_test\n", " date dummy date_encoding\n", "20 20 0 20\n", "21 21 0 21\n", "22 22 0 22\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n", " 24] TEST: [25 26 27 28 29]\n", "X_train\n", " date dummy date_encoding\n", "0 0 0 0\n", "1 1 0 1\n", "2 2 0 2\n", "\n", "X_test\n", " date dummy date_encoding\n", "25 25 0 25\n", "26 26 0 26\n", "27 27 0 27\n" ] } ], "source": [ "# Assumes the data is already sorted by date.\n", "tscv = TimeSeriesSplit(n_splits=5)\n", "for train_index, test_index in tscv.split(X):\n", "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n", "    # Select rows by position using the index arrays the splitter returns,\n", "    # instead of rebuilding slice bounds from their first/last elements.\n", "    X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n", "    y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n", "    print(\"X_train\")\n", "    print(X_train.head(3))\n", "    print()\n", "    print(\"X_test\")\n", "    print(X_test.head(3))" ] },
{ "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRAIN: [0 1 2 3 4 5 6 7 8] TEST: [ 9 10 11 12 13 14 15]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] TEST: [16 17 18 19 20 21 22]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22] TEST: [23 24 25 26 27 28 29]\n" ] } ], "source": [ "# Fewer splits: each test window gets longer.\n", "tscv = TimeSeriesSplit(n_splits=3)\n", "for train_index, test_index in tscv.split(X):\n", "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n", "    X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n", "    y_train, y_test = y.iloc[train_index], y.iloc[test_index]" ] },
{ "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11] TEST: [12 13]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13] TEST: [14 15]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] TEST: [16 17]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] TEST: [18 19]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] TEST: [22 23]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n", " 24 25] TEST: [26 27]\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n", " 24 25 26 27] TEST: [28 29]\n" ] } ], "source": [ "# More splits: each test window gets shorter.\n", "tscv = TimeSeriesSplit(n_splits=10)\n", "for train_index, test_index in tscv.split(X):\n", "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n", "    X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n", "    y_train, y_test = y.iloc[train_index], y.iloc[test_index]" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Simple Modeling" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "le = LabelEncoder()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datedummy
02019-01-01a
12019-01-02a
\n", "
" ], "text/plain": [ " date dummy\n", "0 2019-01-01 a\n", "1 2019-01-02 a" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head(2)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LabelEncoder()" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "le.fit(X['date'])" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "X['date'] = le.transform(X['date']) \n", "X['dummy'] = 0" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datedummydate_encoding
0000
1101
\n", "
" ], "text/plain": [ " date dummy date_encoding\n", "0 0 0 0\n", "1 1 0 1" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head(2)" ] },
{ "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TRAIN: [0 1 2 3 4 5] TEST: [ 6 7 8 9 10 11]\n", "6 6 6\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11] TEST: [12 13 14 15 16 17]\n", "12 12 6\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] TEST: [18 19 20 21 22 23]\n", "18 18 6\n", "TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25 26 27 28 29]\n", "24 24 6\n" ] } ], "source": [ "# Train one XGBoost regressor per time-ordered split and collect its predictions\n", "# for that split's test window.\n", "xgb_preds = []  # one list of test-window predictions per split\n", "tscv = TimeSeriesSplit(n_splits=4)\n", "for train_index, test_index in tscv.split(X):\n", "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n", "    # Select rows by position using the index arrays the splitter returns.\n", "    X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n", "    y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n", "    print(len(X_train), len(y_train), len(X_test))\n", "    # The regressor uses library defaults apart from n_estimators; the parameter\n", "    # dict that used to be built here was never passed to the model, so it is gone.\n", "    xgb_regressor = xgb.XGBRegressor(n_estimators=1000)\n", "    xgb_model = xgb_regressor.fit(X_train, y_train, verbose=False)\n", "\n", "    xgb_pred = xgb_model.predict(X_test)\n", "    xgb_preds.append(list(xgb_pred))" ] },
{ "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [], "source": [ "# Position-wise mean over the splits: entry i averages the i-th prediction of every split.\n", "# The fold count comes from the data, and the builtin sum() is no longer shadowed by a\n", "# notebook-level variable of the same name.\n", "# NOTE(review): each split predicts a different test window, so this averages across\n", "# windows rather than across models for the same rows - confirm this is intended.\n", "n_folds = len(xgb_preds)\n", "preds = [sum(fold_values) / n_folds for fold_values in zip(*xgb_preds)]\n", "\n", "output = pd.DataFrame({'id': 'unknown', 'target': preds})" ] },
{ "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtarget
0unknown1.0
1unknown1.0
2unknown1.0
3unknown1.0
4unknown1.0
5unknown1.0
\n", "
" ], "text/plain": [ " id target\n", "0 unknown 1.0\n", "1 unknown 1.0\n", "2 unknown 1.0\n", "3 unknown 1.0\n", "4 unknown 1.0\n", "5 unknown 1.0" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }