{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Web Traffic Time Series Forecasting - DNN\n",
"=========================================\n",
"\n",
"* [I. Prepare Data](#prepare-data)\n",
"* [II. Feature Engineering](#feature-engineering)\n",
"* [III. Model Training and Prediction](#model-training-prediction)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from tqdm import tqdm\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.model_selection import KFold\n",
"\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"import tensorflow.keras.backend as K\n",
"\n",
"pd.options.display.max_columns = None\n",
"\n",
"import os\n",
"os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' # TF INFO not printed"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data_input_dir = \"data_input/\"\n",
"train_csv = data_input_dir + \"train_2.csv\"\n",
"test_key_csv = data_input_dir + \"key_2.csv\"\n",
"submission_csv = \"./submission.csv\"\n",
"\n",
"pred_horizon = 62\n",
"lookback_weeks = 16 # The look back weeks for computing the median features."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed = 0\n",
"tf.random.set_seed(42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# I. Prepare Data "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def load_data(train_csv: str) -> tuple:\n",
" \"\"\"\n",
" Read data from CSV and split page column from views time series.\n",
" \"\"\"\n",
" train = pd.read_csv(train_csv)\n",
" page = train['Page'].copy()\n",
" views = train.iloc[:, 1:]\n",
" return page, views\n",
"\n",
"def prepare_data(page: pd.DataFrame, views: pd.DataFrame, lookback_weeks: int) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, int]:\n",
" \"\"\"\n",
" - Split Page column to meta features.\n",
" - Split days to train and test sets for 2016 and 2017. Modeling will take advantage of yearly seasonality, 2016 for training, 2017 for test prediction.\n",
" X_train: 2016-03-14 ~ 2016-09-10\n",
" y_train: 2016-09-13 ~ 2016-11-14, 9 weeks (1 more day than required)\n",
" X_test: 2017-03-14 ~ 2017-09-10\n",
" y_test: 2017-09-13 ~ 2017-11-13\n",
" \"\"\"\n",
" meta = page.str.rsplit('_', n=3, expand=True)\n",
" meta.columns = ['title', 'site', 'access', 'agent']\n",
" meta = pd.concat([page, meta['site'], meta[['access', 'agent']].agg('_'.join, axis=1)], axis=1)\n",
" meta.columns = ['Page', 'Site', 'AccessAgent']\n",
"\n",
" lookback_days = lookback_weeks * 7\n",
" X_train_end_idx = views.columns.get_loc('2016-09-10') + 1 # cut off at 09/10 as train_2 data stops at 2017/09/10, so we can use yearly seasonality\n",
" X_train = views.iloc[:, (X_train_end_idx - lookback_days) : X_train_end_idx].iloc[:, ::-1]\n",
" X_train = np.log1p(X_train)\n",
"\n",
" y_train_start_idx = views.columns.get_loc('2016-09-13')\n",
" y_train = views.iloc[:, y_train_start_idx : (y_train_start_idx + pred_horizon + 1)].fillna(0)#.astype('float32').copy()\n",
"\n",
" X_test = views.iloc[:, -lookback_days:].iloc[:, ::-1]\n",
" X_test = np.log1p(X_test)\n",
"\n",
" return meta, X_train, y_train, X_test\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"page, views = load_data(train_csv)\n",
"meta, X_visits_train, y_visits_train, X_visits_test = prepare_data(page, views, lookback_weeks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# II. Feature Engineering "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def add_median(X, y=None, n_weeks: int=16):\n",
" \"\"\"\n",
" Features: weekly median of log1p(visits) - n_weeks median of log1p(visits), for the latest n_weeks//2 weeks.\n",
" Labels: log1p(visits) - n_weeks median of log1p(visits)\n",
" \"\"\"\n",
" X_median_all = X.iloc[:, : 7 * n_weeks].median(axis=1).fillna(0).values.reshape(-1, 1)\n",
"\n",
" n_features = n_weeks // 2\n",
" X_medians = np.empty((X.shape[0], n_features))\n",
" for i in range(n_features):\n",
" X_medians[:, i] = X.iloc[:, i*7 : (i+1)*7].median(axis=1, skipna=True).values\n",
" X_medians = np.nan_to_num(X_medians - X_median_all, nan=0.)\n",
"\n",
" if y is not None:\n",
" y_medians = np.nan_to_num(np.log1p(y.values) - X_median_all, nan=0.)\n",
" else:\n",
" y_medians = None\n",
"\n",
" return X_medians, y_medians, X_median_all\n",
"\n",
"def one_hot_encode(valid):\n",
" onehot_encoder = OneHotEncoder()\n",
" site_access_enc = onehot_encoder.fit_transform(valid[['Site', 'AccessAgent']]).toarray()\n",
" return site_access_enc\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# add median\n",
"X_medians_train, y_train, X_medians_all_train = add_median(X_visits_train, y_visits_train, lookback_weeks)\n",
"X_medians_test, _, X_medians_all_test = add_median(X_visits_test, None, lookback_weeks)\n",
"\n",
"# one-hot encode category variables\n",
"X_cat = one_hot_encode(meta)\n",
"\n",
"# Combine numerical and categorical features\n",
"X_train = np.c_[X_medians_train, X_cat]\n",
"X_test = np.c_[X_medians_test, X_cat]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# III. Model Training and Prediction "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def smape(y_true, y_pred):\n",
" \"\"\"\n",
" Compute the SMAPE metric. Input could be >1D.\n",
" \"\"\"\n",
" y_true, y_pred = np.ravel(y_true), np.ravel(y_pred)\n",
" denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0\n",
" ape = np.abs(y_true - y_pred) / denom # absolute percentage error before averaging\n",
" ape[denom == 0.0] = 0.0 # Replace NaN with 0.0 in case div by 0\n",
" mape = np.nanmean(ape) # Mean of APE\n",
" return mape\n",
"\n",
"def clipped_mae_loss(y_true, y_pred):\n",
" \"\"\"\n",
" Mean absolute error clipped, used as training loss.\n",
" \"\"\"\n",
" return K.mean(K.clip(K.abs(y_pred - y_true), 0., 1.), axis=-1)\n",
"\n",
"def build_one_dnn(dim_X, dim_y, dropout_rate=0.5, C=0.00004):\n",
" \"\"\"\n",
" Build one DNN model with a skip connection.\n",
" \"\"\"\n",
" # Input tensor\n",
" input_tensor = keras.Input(shape=(dim_X,))\n",
"\n",
" # hidden layer 1\n",
" hidden1 = keras.layers.Dense(\n",
" 200, activation='relu',\n",
" kernel_initializer='lecun_uniform',\n",
" kernel_regularizer=keras.regularizers.L2(C)\n",
" )(input_tensor)\n",
" hidden1 = keras.layers.Dropout(dropout_rate)(hidden1)\n",
"\n",
" # Wide concatenation\n",
" concat = keras.layers.Concatenate()([input_tensor, hidden1])\n",
"\n",
" # hidden layer 2 with batch normalization\n",
" hidden2 = keras.layers.Dense(\n",
" 200, activation='relu',\n",
" kernel_initializer='lecun_uniform',\n",
" kernel_regularizer=keras.regularizers.L2(C)\n",
" )(concat)\n",
" hidden2 = keras.layers.BatchNormalization(\n",
" beta_regularizer=keras.regularizers.L2(C),\n",
" gamma_regularizer=keras.regularizers.L2(C)\n",
" )(hidden2)\n",
" hidden2 = keras.layers.Dropout(dropout_rate)(hidden2)\n",
"\n",
" # hidden layer 3\n",
" hidden3 = keras.layers.Dense(\n",
" 100, activation='relu',\n",
" kernel_initializer='lecun_uniform',\n",
" kernel_regularizer=keras.regularizers.L2(C)\n",
" )(hidden2)\n",
" hidden3 = keras.layers.Dropout(dropout_rate)(hidden3)\n",
"\n",
" # hidden layer 4\n",
" hidden4 = keras.layers.Dense(\n",
" 200, activation='relu',\n",
" kernel_initializer='lecun_uniform',\n",
" kernel_regularizer=keras.regularizers.L2(C)\n",
" )(hidden3)\n",
" hidden4 = keras.layers.Dropout(dropout_rate)(hidden4)\n",
"\n",
" # output layer\n",
" output = keras.layers.Dense(\n",
" dim_y, activation='linear',\n",
" kernel_initializer='lecun_uniform',\n",
" kernel_regularizer=keras.regularizers.L2(C)\n",
" )(hidden4)\n",
" \n",
" # generate model\n",
" model = keras.Model(inputs=input_tensor, outputs=[output])\n",
" model.compile(loss=clipped_mae_loss, optimizer='adam')\n",
" return model\n",
"\n",
"def fit_predict(\n",
" X_train, y_train, median_n_weeks_train, y_visits_train,\n",
" X_test, median_n_weeks_test,\n",
" n_bagging, n_rounds, epochs, batch_size):\n",
" \"\"\"\n",
" Train a bag of DNN models on K-fold train splits. The test predictions are the medians of individual model predictions.\n",
" \"\"\"\n",
" model_bag = [build_one_dnn(X_train.shape[1], y_train.shape[1]) for _ in range(n_bagging)]\n",
" kfold = KFold(n_splits=n_bagging)\n",
"\n",
" y_test_pred = np.zeros((n_bagging, *y_train.shape)) # y_test y_train same shape[1]\n",
" visits_test_pred = np.zeros(y_train.shape) # y_test y_train same shape[1]\n",
" min_valid_loss = float('inf')\n",
" for i in range(n_rounds):\n",
" print(f\"Round {i}\", end=' ') \n",
" y_pred = np.zeros(y_train.shape)\n",
" for k, (train_index, test_index) in tqdm(\n",
" enumerate(kfold.split(X_train, y_train)), total=n_bagging):\n",
" X_train_k, y_train_k = X_train[train_index, :], y_train[train_index, :]\n",
" X_test_k = X_train[test_index, :]\n",
" model = model_bag[k]\n",
" history = model.fit(X_train_k, y_train_k,\n",
" #validation_data=[[X_train_median_diff[test_index, :], X_train_cat[test_index, :]], y_train[test_index, :]],\n",
" epochs=epochs, batch_size=batch_size, verbose=0, shuffle=True)\n",
" y_pred[test_index, :] = model.predict(X_test_k, batch_size=batch_size)\n",
" _ = model.predict(X_test)\n",
" \n",
" visits_pred = np.expm1(y_pred + median_n_weeks_train)\n",
" visits_pred[visits_pred < 0.5] = 0\n",
" valid_loss = smape(y_visits_train, visits_pred)\n",
" print(f\"{valid_loss = :.5f}\")\n",
"\n",
" # update test prediction if validation improves\n",
" if valid_loss < min_valid_loss:\n",
" for k in range(n_bagging):\n",
" y_test_pred[k, :, :] = model_bag[k].predict(X_test, batch_size=batch_size)\n",
" visits_test_pred = np.expm1(np.nanmedian(y_test_pred, axis=0) + median_n_weeks_test)\n",
" visits_test_pred[visits_test_pred < 0.5] = 0\n",
" \n",
" return visits_test_pred"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-01-01 22:53:52.269441: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n",
"2022-01-01 22:53:52.269460: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Round 0 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [04:38<00:00, 13.90s/it]\n",
"/tmp/ipykernel_85912/1704954077.py:7: RuntimeWarning: invalid value encountered in true_divide\n",
" ape = np.abs(y_true - y_pred) / denom # absolute percentage error before averaging\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.48138\n",
"Round 1 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [04:24<00:00, 13.23s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.47140\n",
"Round 2 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:41<00:00, 8.08s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.45864\n",
"Round 3 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:22<00:00, 7.14s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.45100\n",
"Round 4 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:17<00:00, 6.88s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44859\n",
"Round 5 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:16<00:00, 6.83s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44747\n",
"Round 6 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [26:45<00:00, 80.27s/it] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44657\n",
"Round 7 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [03:09<00:00, 9.49s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44559\n",
"Round 8 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:22<00:00, 7.10s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44567\n",
"Round 9 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:18<00:00, 6.93s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44559\n",
"Round 10 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:16<00:00, 6.82s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44496\n",
"Round 11 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:16<00:00, 6.83s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44496\n",
"Round 12 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:26<00:00, 7.34s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44466\n",
"Round 13 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:21<00:00, 7.10s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44460\n",
"Round 14 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:23<00:00, 7.16s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44462\n",
"Round 15 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:23<00:00, 7.16s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44425\n",
"Round 16 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:23<00:00, 7.15s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44424\n",
"Round 17 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:17<00:00, 6.90s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44412\n",
"Round 18 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:17<00:00, 6.88s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44441\n",
"Round 19 "
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 20/20 [02:17<00:00, 6.88s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid_loss = 0.44431\n"
]
}
],
"source": [
"n_bagging = 20\n",
"n_rounds = 20\n",
"epochs=10\n",
"batch_size = 4096\n",
"\n",
"visits_test_pred = fit_predict(\n",
" X_train, y_train, X_medians_all_train, y_visits_train,\n",
" X_test, X_medians_all_test,\n",
" n_bagging, n_rounds=n_rounds, epochs=epochs, batch_size=batch_size)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# make submission\n",
"test_key = pd.read_csv(test_key_csv)\n",
"test_key_split = test_key['Page'].str.rsplit('_', n=1, expand=True)\n",
"test_key_split.columns = ['Page', 'Date']\n",
"test_key = pd.concat([test_key_split, test_key[['Id']]], axis=1)\n",
"\n",
"test_dates = sorted(test_key['Date'].unique())\n",
"visits_test_pred_df = pd.DataFrame(visits_test_pred[:, 1:], columns=test_dates)\n",
"visits_test_pred_df = pd.concat([visits_test_pred_df, meta[['Page']]], axis=1)\n",
"visits_test_pred_df = pd.melt(visits_test_pred_df, id_vars=['Page'], var_name='Date', value_name='Visits')\n",
"submission_df = visits_test_pred_df.merge(test_key, how='left', on=['Page', 'Date'])[['Id', 'Visits']]\n",
"submission_df.to_csv(submission_csv, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Score: 37.73212 (at 11th place)"
]
}
],
"metadata": {
"interpreter": {
"hash": "f6b0d3faf30ba3d9ead12cafe30eabbf1d71a53f9577af15dc3e1a12414ee7e5"
},
"kernelspec": {
"display_name": "Python 3.9.7 64-bit ('ml': conda)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}