{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sliding window hold\n",
"- K Hold 말고, 기간이 있으면 누적해서 Train을 설정하는 Hold\n",
"- 주로 시계열 데이터를 다룰 때 사용\n",
"- [참고 문서](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html)\n",
""
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import TimeSeriesSplit\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import xgboost as xgb"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"date_index = pd.date_range(start='2019-01-01', end='2019-01-30', freq='1D')\n",
"X = pd.DataFrame(date_index, columns=['date'])\n",
"X['dummy'] = 'a'\n",
"X['label'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"y = X['label']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"del X['label']"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" dummy | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2019-01-01 | \n",
" a | \n",
"
\n",
" \n",
" 1 | \n",
" 2019-01-02 | \n",
" a | \n",
"
\n",
" \n",
" 2 | \n",
" 2019-01-03 | \n",
" a | \n",
"
\n",
" \n",
" 3 | \n",
" 2019-01-04 | \n",
" a | \n",
"
\n",
" \n",
" 4 | \n",
" 2019-01-05 | \n",
" a | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date dummy\n",
"0 2019-01-01 a\n",
"1 2019-01-02 a\n",
"2 2019-01-03 a\n",
"3 2019-01-04 a\n",
"4 2019-01-05 a"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TRAIN: [0 1 2 3 4] TEST: [5 6 7 8 9]\n",
"X_train\n",
" date dummy date_encoding\n",
"0 0 0 0\n",
"1 1 0 1\n",
"2 2 0 2\n",
"\n",
"X_test\n",
" date dummy date_encoding\n",
"5 5 0 5\n",
"6 6 0 6\n",
"7 7 0 7\n",
"TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11 12 13 14]\n",
"X_train\n",
" date dummy date_encoding\n",
"0 0 0 0\n",
"1 1 0 1\n",
"2 2 0 2\n",
"\n",
"X_test\n",
" date dummy date_encoding\n",
"10 10 0 10\n",
"11 11 0 11\n",
"12 12 0 12\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] TEST: [15 16 17 18 19]\n",
"X_train\n",
" date dummy date_encoding\n",
"0 0 0 0\n",
"1 1 0 1\n",
"2 2 0 2\n",
"\n",
"X_test\n",
" date dummy date_encoding\n",
"15 15 0 15\n",
"16 16 0 16\n",
"17 17 0 17\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21 22 23 24]\n",
"X_train\n",
" date dummy date_encoding\n",
"0 0 0 0\n",
"1 1 0 1\n",
"2 2 0 2\n",
"\n",
"X_test\n",
" date dummy date_encoding\n",
"20 20 0 20\n",
"21 21 0 21\n",
"22 22 0 22\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n",
" 24] TEST: [25 26 27 28 29]\n",
"X_train\n",
" date dummy date_encoding\n",
"0 0 0 0\n",
"1 1 0 1\n",
"2 2 0 2\n",
"\n",
"X_test\n",
" date dummy date_encoding\n",
"25 25 0 25\n",
"26 26 0 26\n",
"27 27 0 27\n"
]
}
],
"source": [
"# 데이터가 이미 date 기준으로 sort 되었다고 가정하고 진행\n",
"tscv = TimeSeriesSplit(n_splits=5)\n",
"for train_index, test_index in tscv.split(X):\n",
" print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
" X_train, X_test = X[train_index[0]: test_index[0]], X[test_index[0]:test_index[-1]+1]\n",
" y_train, y_test = y[train_index[0]: test_index[0]], y[test_index[0]:test_index[-1]+1]\n",
" print(\"X_train\")\n",
" print(X_train.head(3))\n",
" print()\n",
" print(\"X_test\")\n",
" print(X_test.head(3))"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TRAIN: [0 1 2 3 4 5 6 7 8] TEST: [ 9 10 11 12 13 14 15]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] TEST: [16 17 18 19 20 21 22]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22] TEST: [23 24 25 26 27 28 29]\n"
]
}
],
"source": [
"tscv = TimeSeriesSplit(n_splits=3)\n",
"for train_index, test_index in tscv.split(X):\n",
" print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
" X_train, X_test = X[train_index[0]: test_index[0]], X[test_index[0]:test_index[-1]+1]\n",
" y_train, y_test = y[train_index[0]: test_index[0]], y[test_index[0]:test_index[-1]+1]"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11] TEST: [12 13]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13] TEST: [14 15]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] TEST: [16 17]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] TEST: [18 19]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] TEST: [22 23]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n",
" 24 25] TEST: [26 27]\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n",
" 24 25 26 27] TEST: [28 29]\n"
]
}
],
"source": [
"tscv = TimeSeriesSplit(n_splits=10)\n",
"for train_index, test_index in tscv.split(X):\n",
" print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
" X_train, X_test = X[train_index[0]: test_index[0]], X[test_index[0]:test_index[-1]+1]\n",
" y_train, y_test = y[train_index[0]: test_index[0]], y[test_index[0]:test_index[-1]+1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple Modeling"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"le = LabelEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" dummy | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2019-01-01 | \n",
" a | \n",
"
\n",
" \n",
" 1 | \n",
" 2019-01-02 | \n",
" a | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date dummy\n",
"0 2019-01-01 a\n",
"1 2019-01-02 a"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LabelEncoder()"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"le.fit(X['date'])"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"X['date'] = le.transform(X['date']) \n",
"X['dummy'] = 0"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" dummy | \n",
" date_encoding | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date dummy date_encoding\n",
"0 0 0 0\n",
"1 1 0 1"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TRAIN: [0 1 2 3 4 5] TEST: [ 6 7 8 9 10 11]\n",
"6 6 6\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11] TEST: [12 13 14 15 16 17]\n",
"12 12 6\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] TEST: [18 19 20 21 22 23]\n",
"18 18 6\n",
"TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25 26 27 28 29]\n",
"24 24 6\n"
]
}
],
"source": [
"xgb_preds = []\n",
"tscv = TimeSeriesSplit(n_splits=4)\n",
"for train_index, test_index in tscv.split(X):\n",
" print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
" X_train, X_test = X[train_index[0]: test_index[0]], X[test_index[0]:test_index[-1]+1]\n",
" y_train, y_test = y[train_index[0]: test_index[0]], y[test_index[0]:test_index[-1]+1]\n",
" print(len(X_train), len(y_train), len(X_test))\n",
" xgb_params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}\n",
" xgb_regressor = xgb.XGBRegressor(n_estimators=1000)\n",
" xgb_model = xgb_regressor.fit(X_train, y_train, verbose=False)\n",
" \n",
" xgb_pred = xgb_model.predict(X_test)\n",
"\n",
" xgb_preds.append(list(xgb_pred))\n",
"# print('cv', cross_val_score(xgb_model, X_train, y_train, cv=tscv, scoring='accuracy'))"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"preds=[]\n",
"for i in range(len(xgb_preds[0])):\n",
" sum=0\n",
" for j in range(4):\n",
" sum+=xgb_preds[j][i]\n",
" preds.append(sum / 4)\n",
"\n",
"output = pd.DataFrame({'id': 'unknown', 'target': preds})"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" unknown | \n",
" 1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" unknown | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" unknown | \n",
" 1.0 | \n",
"
\n",
" \n",
" 3 | \n",
" unknown | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" unknown | \n",
" 1.0 | \n",
"
\n",
" \n",
" 5 | \n",
" unknown | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id target\n",
"0 unknown 1.0\n",
"1 unknown 1.0\n",
"2 unknown 1.0\n",
"3 unknown 1.0\n",
"4 unknown 1.0\n",
"5 unknown 1.0"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}