{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sliding window hold\n",
    "- K Hold 말고, 기간이 있으면 누적해서 Train을 설정하는 Hold\n",
    "- 주로 시계열 데이터를 다룰 때 사용\n",
    "- [참고 문서](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html)\n",
    "<img src=\"https://habrastorage.org/webt/8i/5k/vx/8i5kvxrehatyvf-l3glz_-ymhtw.png\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import TimeSeriesSplit\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "date_index = pd.date_range(start='2019-01-01', end='2019-01-30', freq='1D')\n",
    "X = pd.DataFrame(date_index, columns=['date'])\n",
    "X['dummy'] = 'a'\n",
    "X['label'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = X['label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "del X['label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>dummy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2019-01-01</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-01-02</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2019-01-03</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2019-01-04</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2019-01-05</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date dummy\n",
       "0 2019-01-01     a\n",
       "1 2019-01-02     a\n",
       "2 2019-01-03     a\n",
       "3 2019-01-04     a\n",
       "4 2019-01-05     a"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TRAIN: [0 1 2 3 4] TEST: [5 6 7 8 9]\n",
      "X_train\n",
      "   date  dummy  date_encoding\n",
      "0     0      0              0\n",
      "1     1      0              1\n",
      "2     2      0              2\n",
      "\n",
      "X_test\n",
      "   date  dummy  date_encoding\n",
      "5     5      0              5\n",
      "6     6      0              6\n",
      "7     7      0              7\n",
      "TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11 12 13 14]\n",
      "X_train\n",
      "   date  dummy  date_encoding\n",
      "0     0      0              0\n",
      "1     1      0              1\n",
      "2     2      0              2\n",
      "\n",
      "X_test\n",
      "    date  dummy  date_encoding\n",
      "10    10      0             10\n",
      "11    11      0             11\n",
      "12    12      0             12\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] TEST: [15 16 17 18 19]\n",
      "X_train\n",
      "   date  dummy  date_encoding\n",
      "0     0      0              0\n",
      "1     1      0              1\n",
      "2     2      0              2\n",
      "\n",
      "X_test\n",
      "    date  dummy  date_encoding\n",
      "15    15      0             15\n",
      "16    16      0             16\n",
      "17    17      0             17\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21 22 23 24]\n",
      "X_train\n",
      "   date  dummy  date_encoding\n",
      "0     0      0              0\n",
      "1     1      0              1\n",
      "2     2      0              2\n",
      "\n",
      "X_test\n",
      "    date  dummy  date_encoding\n",
      "20    20      0             20\n",
      "21    21      0             21\n",
      "22    22      0             22\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n",
      " 24] TEST: [25 26 27 28 29]\n",
      "X_train\n",
      "   date  dummy  date_encoding\n",
      "0     0      0              0\n",
      "1     1      0              1\n",
      "2     2      0              2\n",
      "\n",
      "X_test\n",
      "    date  dummy  date_encoding\n",
      "25    25      0             25\n",
      "26    26      0             26\n",
      "27    27      0             27\n"
     ]
    }
   ],
   "source": [
    "# 데이터가 이미 date 기준으로 sort 되었다고 가정하고 진행\n",
    "tscv = TimeSeriesSplit(n_splits=5)\n",
    "for train_index, test_index in tscv.split(X):\n",
    "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
    "    X_train, X_test = X[train_index[0]: test_index[0]], X[test_index[0]:test_index[-1]+1]\n",
    "    y_train, y_test = y[train_index[0]: test_index[0]], y[test_index[0]:test_index[-1]+1]\n",
    "    print(\"X_train\")\n",
    "    print(X_train.head(3))\n",
    "    print()\n",
    "    print(\"X_test\")\n",
    "    print(X_test.head(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TRAIN: [0 1 2 3 4 5 6 7 8] TEST: [ 9 10 11 12 13 14 15]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] TEST: [16 17 18 19 20 21 22]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] TEST: [23 24 25 26 27 28 29]\n"
     ]
    }
   ],
   "source": [
    "tscv = TimeSeriesSplit(n_splits=3)\n",
    "for train_index, test_index in tscv.split(X):\n",
    "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
    "    X_train, X_test = X[train_index[0]: test_index[0]], X[test_index[0]:test_index[-1]+1]\n",
    "    y_train, y_test = y[train_index[0]: test_index[0]], y[test_index[0]:test_index[-1]+1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11] TEST: [12 13]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13] TEST: [14 15]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] TEST: [16 17]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17] TEST: [18 19]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21] TEST: [22 23]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n",
      " 24 25] TEST: [26 27]\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n",
      " 24 25 26 27] TEST: [28 29]\n"
     ]
    }
   ],
   "source": [
    "tscv = TimeSeriesSplit(n_splits=10)\n",
    "for train_index, test_index in tscv.split(X):\n",
    "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
    "    X_train, X_test = X[train_index[0]: test_index[0]], X[test_index[0]:test_index[-1]+1]\n",
    "    y_train, y_test = y[train_index[0]: test_index[0]], y[test_index[0]:test_index[-1]+1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple Modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "le = LabelEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>dummy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2019-01-01</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2019-01-02</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date dummy\n",
       "0 2019-01-01     a\n",
       "1 2019-01-02     a"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LabelEncoder()"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "le.fit(X['date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['date'] = le.transform(X['date']) \n",
    "X['dummy'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>dummy</th>\n",
       "      <th>date_encoding</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   date  dummy  date_encoding\n",
       "0     0      0              0\n",
       "1     1      0              1"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TRAIN: [0 1 2 3 4 5] TEST: [ 6  7  8  9 10 11]\n",
      "6 6 6\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11] TEST: [12 13 14 15 16 17]\n",
      "12 12 6\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17] TEST: [18 19 20 21 22 23]\n",
      "18 18 6\n",
      "TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25 26 27 28 29]\n",
      "24 24 6\n"
     ]
    }
   ],
   "source": [
    "xgb_preds = []\n",
    "tscv = TimeSeriesSplit(n_splits=4)\n",
    "for train_index, test_index in tscv.split(X):\n",
    "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
    "    X_train, X_test = X[train_index[0]: test_index[0]], X[test_index[0]:test_index[-1]+1]\n",
    "    y_train, y_test = y[train_index[0]: test_index[0]], y[test_index[0]:test_index[-1]+1]\n",
    "    print(len(X_train), len(y_train), len(X_test))\n",
    "    xgb_params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}\n",
    "    xgb_regressor = xgb.XGBRegressor(n_estimators=1000)\n",
    "    xgb_model = xgb_regressor.fit(X_train, y_train,  verbose=False)\n",
    "                        \n",
    "    xgb_pred = xgb_model.predict(X_test)\n",
    "\n",
    "    xgb_preds.append(list(xgb_pred))\n",
    "#     print('cv', cross_val_score(xgb_model, X_train, y_train, cv=tscv, scoring='accuracy'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds=[]\n",
    "for i in range(len(xgb_preds[0])):\n",
    "    sum=0\n",
    "    for j in range(4):\n",
    "        sum+=xgb_preds[j][i]\n",
    "    preds.append(sum / 4)\n",
    "\n",
    "output = pd.DataFrame({'id': 'unknown', 'target': preds})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>unknown</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>unknown</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>unknown</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>unknown</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>unknown</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>unknown</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  target\n",
       "0  unknown     1.0\n",
       "1  unknown     1.0\n",
       "2  unknown     1.0\n",
       "3  unknown     1.0\n",
       "4  unknown     1.0\n",
       "5  unknown     1.0"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}