{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import lightgbm as lgb\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load data\n",
    "\n",
    "\n",
    "\n",
    "CRU TS4.01: Climatic Research Unit (CRU) Time-Series (TS) version 4.01 of high-resolution gridded data of month-by-month variation in climate (Jan. 1901- Dec. 2016)  \n",
    "\n",
    "You can download CRU TS4.01 data set from CDEA Archive (needs registration)  \n",
    "http://data.ceda.ac.uk/badc/cru/data/cru_ts/cru_ts_4.01/data/tmp  \n",
    "\n",
    "Note that use of these data is covered by the following licence:  \n",
    "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/  \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read dat file\n",
    "data = pd.read_csv(\"../data/cru_ts4.01.1901.2016.tmp.dat\", delim_whitespace=True, header=None).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reshape\n",
    "data = data.reshape((-1, 360*720))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ignore invalid grids\n",
    "vvv = (data == -999).sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "67420"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# same as the paper\n",
    "(vvv == 0).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_live = data[:, vvv == 0].transpose().astype(\"int32\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(67420, 1392)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_live.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(395, -595)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# seems 10 times degrees celsius\n",
    "# -595 means -59.5 \n",
    "# 395 means 39.5\n",
    "data_live.max(), data_live.min()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample Data 1\n",
    "\n",
    "We want similar situation as MODEL4 in the paper, that says 97% Accuracy.  \n",
    "Here, we randomly pick grid and randomly pick year, around 500K in total. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_size = 519718\n",
    "\n",
    "np.random.seed(71)\n",
    "sample_indices = np.random.choice(67420*77, sample_size, replace=False)\n",
    "train_size = 519718 * 3 // 4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The definition of RISE/FALL in the paper is not clear for me, so simply compare 30 years mean / 10 years mean  \n",
    "(the paper says based on the mean temperature for 10 years after the training period.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = np.zeros((sample_size, 360))\n",
    "y = np.zeros(sample_size).astype(\"int\")\n",
    "\n",
    "for i in range(sample_size):\n",
    "    sample_ind = sample_indices[i]\n",
    "    grid_ind = sample_ind // 77\n",
    "    year_ind = sample_ind % 77\n",
    "    X[i, :] = data_live[grid_ind, (year_ind*12):(year_ind*12+360)]\n",
    "    mean30 = X[i, :].mean()\n",
    "    mean10 = data_live[grid_ind, (year_ind*12+360):(year_ind*12+480)].mean()\n",
    "    if mean10 > mean30:\n",
    "        y[i] = 1\n",
    "    else:\n",
    "        y[i] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(156818, 362900)"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# around same distribution\n",
    "(1-y).sum(), y.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build Model with LightGBM\n",
    "### raw X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = lgb.Dataset(X[:train_size, :], y[:train_size])\n",
    "valid_data = lgb.Dataset(X[train_size:, :], y[train_size:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    \"objective\" : \"binary\", \n",
    "    \"metric\" : \"binary_error\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 200 rounds.\n",
      "[100]\tvalid_0's binary_error: 0.222997\n",
      "[200]\tvalid_0's binary_error: 0.194743\n",
      "[300]\tvalid_0's binary_error: 0.176495\n",
      "[400]\tvalid_0's binary_error: 0.160717\n",
      "[500]\tvalid_0's binary_error: 0.149704\n",
      "[600]\tvalid_0's binary_error: 0.140322\n",
      "[700]\tvalid_0's binary_error: 0.133957\n",
      "[800]\tvalid_0's binary_error: 0.126753\n",
      "[900]\tvalid_0's binary_error: 0.121827\n",
      "[1000]\tvalid_0's binary_error: 0.117663\n",
      "[1100]\tvalid_0's binary_error: 0.113592\n",
      "[1200]\tvalid_0's binary_error: 0.110167\n",
      "[1300]\tvalid_0's binary_error: 0.106904\n",
      "[1400]\tvalid_0's binary_error: 0.103817\n",
      "[1500]\tvalid_0's binary_error: 0.10127\n",
      "[1600]\tvalid_0's binary_error: 0.0989456\n",
      "[1700]\tvalid_0's binary_error: 0.0967136\n",
      "[1800]\tvalid_0's binary_error: 0.0944201\n",
      "[1900]\tvalid_0's binary_error: 0.0925806\n",
      "[2000]\tvalid_0's binary_error: 0.0911953\n",
      "[2100]\tvalid_0's binary_error: 0.0895713\n",
      "[2200]\tvalid_0's binary_error: 0.0882244\n",
      "[2300]\tvalid_0's binary_error: 0.0871854\n",
      "[2400]\tvalid_0's binary_error: 0.0861695\n",
      "[2500]\tvalid_0's binary_error: 0.0854306\n",
      "[2600]\tvalid_0's binary_error: 0.0842531\n",
      "[2700]\tvalid_0's binary_error: 0.0829831\n",
      "[2800]\tvalid_0's binary_error: 0.0818979\n",
      "[2900]\tvalid_0's binary_error: 0.0812745\n",
      "[3000]\tvalid_0's binary_error: 0.0801047\n",
      "[3100]\tvalid_0's binary_error: 0.0795505\n",
      "[3200]\tvalid_0's binary_error: 0.0788348\n",
      "[3300]\tvalid_0's binary_error: 0.0781806\n",
      "[3400]\tvalid_0's binary_error: 0.077434\n",
      "[3500]\tvalid_0's binary_error: 0.0769106\n",
      "[3600]\tvalid_0's binary_error: 0.0762026\n",
      "[3700]\tvalid_0's binary_error: 0.0754791\n",
      "[3800]\tvalid_0's binary_error: 0.0748634\n",
      "[3900]\tvalid_0's binary_error: 0.0743785\n",
      "[4000]\tvalid_0's binary_error: 0.0738013\n",
      "[4100]\tvalid_0's binary_error: 0.0732317\n",
      "[4200]\tvalid_0's binary_error: 0.072693\n",
      "[4300]\tvalid_0's binary_error: 0.0720465\n",
      "[4400]\tvalid_0's binary_error: 0.0715616\n",
      "[4500]\tvalid_0's binary_error: 0.0712999\n",
      "[4600]\tvalid_0's binary_error: 0.0709844\n",
      "[4700]\tvalid_0's binary_error: 0.0704764\n",
      "[4800]\tvalid_0's binary_error: 0.0701916\n",
      "[4900]\tvalid_0's binary_error: 0.0699992\n",
      "[5000]\tvalid_0's binary_error: 0.0696221\n",
      "[5100]\tvalid_0's binary_error: 0.0693758\n",
      "[5200]\tvalid_0's binary_error: 0.0689063\n",
      "[5300]\tvalid_0's binary_error: 0.0686062\n",
      "[5400]\tvalid_0's binary_error: 0.0682983\n",
      "[5500]\tvalid_0's binary_error: 0.0678981\n",
      "[5600]\tvalid_0's binary_error: 0.0676672\n",
      "[5700]\tvalid_0's binary_error: 0.0675672\n",
      "[5800]\tvalid_0's binary_error: 0.0672285\n",
      "[5900]\tvalid_0's binary_error: 0.0670823\n",
      "[6000]\tvalid_0's binary_error: 0.0668129\n",
      "[6100]\tvalid_0's binary_error: 0.0665666\n",
      "[6200]\tvalid_0's binary_error: 0.0663742\n",
      "[6300]\tvalid_0's binary_error: 0.0661587\n",
      "[6400]\tvalid_0's binary_error: 0.0659509\n",
      "[6500]\tvalid_0's binary_error: 0.06572\n",
      "[6600]\tvalid_0's binary_error: 0.0655353\n",
      "[6700]\tvalid_0's binary_error: 0.065289\n",
      "[6800]\tvalid_0's binary_error: 0.0652197\n",
      "[6900]\tvalid_0's binary_error: 0.0648272\n",
      "[7000]\tvalid_0's binary_error: 0.064681\n",
      "[7100]\tvalid_0's binary_error: 0.0644347\n",
      "[7200]\tvalid_0's binary_error: 0.0641884\n",
      "[7300]\tvalid_0's binary_error: 0.064073\n",
      "[7400]\tvalid_0's binary_error: 0.0639421\n",
      "[7500]\tvalid_0's binary_error: 0.0636266\n",
      "[7600]\tvalid_0's binary_error: 0.0636881\n",
      "Early stopping, best iteration is:\n",
      "[7467]\tvalid_0's binary_error: 0.0635573\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<lightgbm.basic.Booster at 0x1162cb470>"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lgb.train(params, train_set=train_data, valid_sets=[valid_data], early_stopping_rounds=200, num_boost_round=10000, verbose_eval=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9364427"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1 - 0.0635573"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### standardized X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_st = X - X.min(axis=1, keepdims=True)\n",
    "X_st = X_st / X_st.max(axis=1, keepdims=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = lgb.Dataset(X_st[:train_size, :], y[:train_size])\n",
    "valid_data = lgb.Dataset(X_st[train_size:, :], y[train_size:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    \"objective\" : \"binary\", \n",
    "    \"metric\" : \"binary_error\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 200 rounds.\n",
      "[100]\tvalid_0's binary_error: 0.167321\n",
      "[200]\tvalid_0's binary_error: 0.131663\n",
      "[300]\tvalid_0's binary_error: 0.112376\n",
      "[400]\tvalid_0's binary_error: 0.0990841\n",
      "[500]\tvalid_0's binary_error: 0.0889633\n",
      "[600]\tvalid_0's binary_error: 0.0815901\n",
      "[700]\tvalid_0's binary_error: 0.0758639\n",
      "[800]\tvalid_0's binary_error: 0.0709382\n",
      "[900]\tvalid_0's binary_error: 0.06719\n",
      "[1000]\tvalid_0's binary_error: 0.0637959\n",
      "[1100]\tvalid_0's binary_error: 0.060725\n",
      "[1200]\tvalid_0's binary_error: 0.0582544\n",
      "[1300]\tvalid_0's binary_error: 0.0558532\n",
      "[1400]\tvalid_0's binary_error: 0.0541138\n",
      "[1500]\tvalid_0's binary_error: 0.0519895\n",
      "[1600]\tvalid_0's binary_error: 0.0506657\n",
      "[1700]\tvalid_0's binary_error: 0.0493189\n",
      "[1800]\tvalid_0's binary_error: 0.0479797\n",
      "[1900]\tvalid_0's binary_error: 0.0466559\n",
      "[2000]\tvalid_0's binary_error: 0.0456861\n",
      "[2100]\tvalid_0's binary_error: 0.0447395\n",
      "[2200]\tvalid_0's binary_error: 0.0436851\n",
      "[2300]\tvalid_0's binary_error: 0.043054\n",
      "[2400]\tvalid_0's binary_error: 0.0425306\n",
      "[2500]\tvalid_0's binary_error: 0.0418841\n",
      "[2600]\tvalid_0's binary_error: 0.0412376\n",
      "[2700]\tvalid_0's binary_error: 0.0406373\n",
      "[2800]\tvalid_0's binary_error: 0.0401678\n",
      "[2900]\tvalid_0's binary_error: 0.0396829\n",
      "[3000]\tvalid_0's binary_error: 0.0390903\n",
      "[3100]\tvalid_0's binary_error: 0.0388132\n",
      "[3200]\tvalid_0's binary_error: 0.0385284\n",
      "[3300]\tvalid_0's binary_error: 0.0379897\n",
      "[3400]\tvalid_0's binary_error: 0.0377049\n",
      "[3500]\tvalid_0's binary_error: 0.037197\n",
      "[3600]\tvalid_0's binary_error: 0.0369738\n",
      "[3700]\tvalid_0's binary_error: 0.0366813\n",
      "[3800]\tvalid_0's binary_error: 0.0365351\n",
      "[3900]\tvalid_0's binary_error: 0.0363657\n",
      "[4000]\tvalid_0's binary_error: 0.0360194\n",
      "[4100]\tvalid_0's binary_error: 0.0356961\n",
      "[4200]\tvalid_0's binary_error: 0.0355499\n",
      "[4300]\tvalid_0's binary_error: 0.0354422\n",
      "[4400]\tvalid_0's binary_error: 0.0352036\n",
      "[4500]\tvalid_0's binary_error: 0.0351266\n",
      "[4600]\tvalid_0's binary_error: 0.0350112\n",
      "[4700]\tvalid_0's binary_error: 0.0348649\n",
      "[4800]\tvalid_0's binary_error: 0.0348187\n",
      "[4900]\tvalid_0's binary_error: 0.0345263\n",
      "[5000]\tvalid_0's binary_error: 0.0345032\n",
      "[5100]\tvalid_0's binary_error: 0.0343339\n",
      "[5200]\tvalid_0's binary_error: 0.0343647\n",
      "[5300]\tvalid_0's binary_error: 0.0342646\n",
      "[5400]\tvalid_0's binary_error: 0.0340106\n",
      "[5500]\tvalid_0's binary_error: 0.0338798\n",
      "[5600]\tvalid_0's binary_error: 0.033926\n",
      "[5700]\tvalid_0's binary_error: 0.0337412\n",
      "[5800]\tvalid_0's binary_error: 0.0336874\n",
      "[5900]\tvalid_0's binary_error: 0.033595\n",
      "[6000]\tvalid_0's binary_error: 0.0335027\n",
      "[6100]\tvalid_0's binary_error: 0.0333487\n",
      "[6200]\tvalid_0's binary_error: 0.0333333\n",
      "[6300]\tvalid_0's binary_error: 0.0332102\n",
      "[6400]\tvalid_0's binary_error: 0.0331178\n",
      "[6500]\tvalid_0's binary_error: 0.0330101\n",
      "[6600]\tvalid_0's binary_error: 0.032987\n",
      "[6700]\tvalid_0's binary_error: 0.0329485\n",
      "[6800]\tvalid_0's binary_error: 0.0329716\n",
      "[6900]\tvalid_0's binary_error: 0.0328638\n",
      "[7000]\tvalid_0's binary_error: 0.0326483\n",
      "[7100]\tvalid_0's binary_error: 0.0326099\n",
      "[7200]\tvalid_0's binary_error: 0.0325945\n",
      "Early stopping, best iteration is:\n",
      "[7037]\tvalid_0's binary_error: 0.0324559\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<lightgbm.basic.Booster at 0x1162b5b00>"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lgb.train(params, train_set=train_data, valid_sets=[valid_data], early_stopping_rounds=200, num_boost_round=10000, verbose_eval=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9675441"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1 - 0.0324559"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample Data 2\n",
    "\n",
    "Now we sample train and valid data in time-series manner and check the Accuracy.  \n",
    "For train data, we randomly pick grid and randomly pick year 0-54(start from 1901-1955), 375K in total.  \n",
    "For valid data, we randomly pick grid and randomly pick year 64-76(start from 1965-1977), 375K in total.  \n",
    "train data (both X and y) do not include 1995- data.  \n",
    "valid_y consists of 1995- data.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_size = 375000\n",
    "\n",
    "np.random.seed(71)\n",
    "train_sample_indices = np.random.choice(67420*55, train_size, replace=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_size = 125000\n",
    "\n",
    "np.random.seed(71)\n",
    "valid_sample_indices = np.random.choice(67420*13, valid_size, replace=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X = np.zeros((train_size, 360))\n",
    "train_y = np.zeros(train_size).astype(\"int\")\n",
    "\n",
    "for i in range(train_size):\n",
    "    sample_ind = train_sample_indices[i]\n",
    "    grid_ind = sample_ind // 55\n",
    "    year_ind = sample_ind % 55\n",
    "    train_X[i, :] = data_live[grid_ind, (year_ind*12):(year_ind*12+360)]\n",
    "    mean30 = train_X[i, :].mean()\n",
    "    mean10 = data_live[grid_ind, (year_ind*12+360):(year_ind*12+480)].mean()\n",
    "    if mean10 > mean30:\n",
    "        train_y[i] = 1\n",
    "    else:\n",
    "        train_y[i] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_X = np.zeros((valid_size, 360))\n",
    "valid_y = np.zeros(valid_size).astype(\"int\")\n",
    "\n",
    "for i in range(valid_size):\n",
    "    sample_ind = valid_sample_indices[i]\n",
    "    grid_ind = sample_ind // 13\n",
    "    year_ind = sample_ind % 13 + 64\n",
    "    valid_X[i, :] = data_live[grid_ind, (year_ind*12):(year_ind*12+360)]\n",
    "    mean30 = valid_X[i, :].mean()\n",
    "    mean10 = data_live[grid_ind, (year_ind*12+360):(year_ind*12+480)].mean()\n",
    "    if mean10 > mean30:\n",
    "        valid_y[i] = 1\n",
    "    else:\n",
    "        valid_y[i] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(162195, 212805, 2613, 122387)"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(1-train_y).sum(), train_y.sum(), (1-valid_y).sum(), valid_y.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.56748, 0.979096)"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_y.mean(), valid_y.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "we see Global warming here...  \n",
    "all 1 model has 98% accuracy.  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### raw X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = lgb.Dataset(train_X, train_y)\n",
    "valid_data = lgb.Dataset(valid_X, valid_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    \"objective\" : \"binary\", \n",
    "    \"metric\" : \"binary_error\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 200 rounds.\n",
      "[100]\tvalid_0's binary_error: 0.145256\n",
      "[200]\tvalid_0's binary_error: 0.141512\n",
      "[300]\tvalid_0's binary_error: 0.140848\n",
      "[400]\tvalid_0's binary_error: 0.143792\n",
      "Early stopping, best iteration is:\n",
      "[288]\tvalid_0's binary_error: 0.139184\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<lightgbm.basic.Booster at 0x1162cbc18>"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lgb.train(params, train_set=train_data, valid_sets=[valid_data], early_stopping_rounds=200, num_boost_round=10000, verbose_eval=100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### standardized X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X_st = train_X - train_X.min(axis=1, keepdims=True)\n",
    "train_X_st = train_X_st / train_X_st.max(axis=1, keepdims=True)\n",
    "valid_X_st = valid_X - valid_X.min(axis=1, keepdims=True)\n",
    "valid_X_st = valid_X_st / valid_X_st.max(axis=1, keepdims=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = lgb.Dataset(train_X_st, train_y)\n",
    "valid_data = lgb.Dataset(valid_X_st, valid_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    \"objective\" : \"binary\", \n",
    "    \"metric\" : \"binary_error\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 200 rounds.\n",
      "[100]\tvalid_0's binary_error: 0.112464\n",
      "[200]\tvalid_0's binary_error: 0.114136\n",
      "[300]\tvalid_0's binary_error: 0.117384\n",
      "Early stopping, best iteration is:\n",
      "[119]\tvalid_0's binary_error: 0.111232\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<lightgbm.basic.Booster at 0x1162cb080>"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lgb.train(params, train_set=train_data, valid_sets=[valid_data], early_stopping_rounds=200, num_boost_round=10000, verbose_eval=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}