{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Web Traffic Time Series Forecasting - DNN\n", "=========================================\n", "\n", "* [I. Prepare Data](#prepare-data)\n", "* [II. Feature Engineering](#feature-engineering)\n", "* [III. Model Training and Prediction](#model-training-prediction)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import re\n", "from tqdm import tqdm\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.model_selection import KFold\n", "\n", "import tensorflow as tf\n", "from tensorflow import keras\n", "import tensorflow.keras.backend as K\n", "\n", "pd.options.display.max_columns = None\n", "\n", "import os\n", "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' # TF INFO not printed" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "data_input_dir = \"data_input/\"\n", "train_csv = data_input_dir + \"train_2.csv\"\n", "test_key_csv = data_input_dir + \"key_2.csv\"\n", "submission_csv = \"./submission.csv\"\n", "\n", "pred_horizon = 62\n", "lookback_weeks = 16 # The look back weeks for computing the median features." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "np.random.seed = 0\n", "tf.random.set_seed(42)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# I. Prepare Data " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def load_data(train_csv: str) -> tuple:\n", " \"\"\"\n", " Read data from CSV and split page column from views time series.\n", " \"\"\"\n", " train = pd.read_csv(train_csv)\n", " page = train['Page'].copy()\n", " views = train.iloc[:, 1:]\n", " return page, views\n", "\n", "def prepare_data(page: pd.DataFrame, views: pd.DataFrame, lookback_weeks: int) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, int]:\n", " \"\"\"\n", " - Split Page column to meta features.\n", " - Split days to train and test sets for 2016 and 2017. Modeling will take advantage of yearly seasonality, 2016 for training, 2017 for test prediction.\n", " X_train: 2016-03-14 ~ 2016-09-10\n", " y_train: 2016-09-13 ~ 2016-11-14, 9 weeks (1 more day than required)\n", " X_test: 2017-03-14 ~ 2017-09-10\n", " y_test: 2017-09-13 ~ 2017-11-13\n", " \"\"\"\n", " meta = page.str.rsplit('_', n=3, expand=True)\n", " meta.columns = ['title', 'site', 'access', 'agent']\n", " meta = pd.concat([page, meta['site'], meta[['access', 'agent']].agg('_'.join, axis=1)], axis=1)\n", " meta.columns = ['Page', 'Site', 'AccessAgent']\n", "\n", " lookback_days = lookback_weeks * 7\n", " X_train_end_idx = views.columns.get_loc('2016-09-10') + 1 # cut off at 09/10 as train_2 data stops at 2017/09/10, so we can use yearly seasonality\n", " X_train = views.iloc[:, (X_train_end_idx - lookback_days) : X_train_end_idx].iloc[:, ::-1]\n", " X_train = np.log1p(X_train)\n", "\n", " y_train_start_idx = views.columns.get_loc('2016-09-13')\n", " y_train = views.iloc[:, y_train_start_idx : (y_train_start_idx + pred_horizon + 1)].fillna(0)#.astype('float32').copy()\n", "\n", " X_test = views.iloc[:, -lookback_days:].iloc[:, ::-1]\n", " X_test = np.log1p(X_test)\n", "\n", " return meta, X_train, y_train, X_test\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "page, views = load_data(train_csv)\n", "meta, X_visits_train, y_visits_train, X_visits_test = prepare_data(page, views, lookback_weeks)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# II. Feature Engineering " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "\n", "def add_median(X, y=None, n_weeks: int=16):\n", " \"\"\"\n", " Features: weekly median of log1p(visits) - n_weeks median of log1p(visits), for the latest n_weeks//2 weeks.\n", " Labels: log1p(visits) - n_weeks median of log1p(visits)\n", " \"\"\"\n", " X_median_all = X.iloc[:, : 7 * n_weeks].median(axis=1).fillna(0).values.reshape(-1, 1)\n", "\n", " n_features = n_weeks // 2\n", " X_medians = np.empty((X.shape[0], n_features))\n", " for i in range(n_features):\n", " X_medians[:, i] = X.iloc[:, i*7 : (i+1)*7].median(axis=1, skipna=True).values\n", " X_medians = np.nan_to_num(X_medians - X_median_all, nan=0.)\n", "\n", " if y is not None:\n", " y_medians = np.nan_to_num(np.log1p(y.values) - X_median_all, nan=0.)\n", " else:\n", " y_medians = None\n", "\n", " return X_medians, y_medians, X_median_all\n", "\n", "def one_hot_encode(valid):\n", " onehot_encoder = OneHotEncoder()\n", " site_access_enc = onehot_encoder.fit_transform(valid[['Site', 'AccessAgent']]).toarray()\n", " return site_access_enc\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# add median\n", "X_medians_train, y_train, X_medians_all_train = add_median(X_visits_train, y_visits_train, lookback_weeks)\n", "X_medians_test, _, X_medians_all_test = add_median(X_visits_test, None, lookback_weeks)\n", "\n", "# one-hot encode category variables\n", "X_cat = one_hot_encode(meta)\n", "\n", "# Combine numerical and categorical features\n", "X_train = np.c_[X_medians_train, X_cat]\n", "X_test = np.c_[X_medians_test, X_cat]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# III. Model Training and Prediction " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def smape(y_true, y_pred):\n", " \"\"\"\n", " Compute the SMAPE metric. Input could be >1D.\n", " \"\"\"\n", " y_true, y_pred = np.ravel(y_true), np.ravel(y_pred)\n", " denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0\n", " ape = np.abs(y_true - y_pred) / denom # absolute percentage error before averaging\n", " ape[denom == 0.0] = 0.0 # Replace NaN with 0.0 in case div by 0\n", " mape = np.nanmean(ape) # Mean of APE\n", " return mape\n", "\n", "def clipped_mae_loss(y_true, y_pred):\n", " \"\"\"\n", " Mean absolute error clipped, used as training loss.\n", " \"\"\"\n", " return K.mean(K.clip(K.abs(y_pred - y_true), 0., 1.), axis=-1)\n", "\n", "def build_one_dnn(dim_X, dim_y, dropout_rate=0.5, C=0.00004):\n", " \"\"\"\n", " Build one DNN model with a skip connection.\n", " \"\"\"\n", " # Input tensor\n", " input_tensor = keras.Input(shape=(dim_X,))\n", "\n", " # hidden layer 1\n", " hidden1 = keras.layers.Dense(\n", " 200, activation='relu',\n", " kernel_initializer='lecun_uniform',\n", " kernel_regularizer=keras.regularizers.L2(C)\n", " )(input_tensor)\n", " hidden1 = keras.layers.Dropout(dropout_rate)(hidden1)\n", "\n", " # Wide concatenation\n", " concat = keras.layers.Concatenate()([input_tensor, hidden1])\n", "\n", " # hidden layer 2 with batch normalization\n", " hidden2 = keras.layers.Dense(\n", " 200, activation='relu',\n", " kernel_initializer='lecun_uniform',\n", " kernel_regularizer=keras.regularizers.L2(C)\n", " )(concat)\n", " hidden2 = keras.layers.BatchNormalization(\n", " beta_regularizer=keras.regularizers.L2(C),\n", " gamma_regularizer=keras.regularizers.L2(C)\n", " )(hidden2)\n", " hidden2 = keras.layers.Dropout(dropout_rate)(hidden2)\n", "\n", " # hidden layer 3\n", " hidden3 = keras.layers.Dense(\n", " 100, activation='relu',\n", " kernel_initializer='lecun_uniform',\n", " kernel_regularizer=keras.regularizers.L2(C)\n", " )(hidden2)\n", " hidden3 = keras.layers.Dropout(dropout_rate)(hidden3)\n", "\n", " # hidden layer 4\n", " hidden4 = keras.layers.Dense(\n", " 200, activation='relu',\n", " kernel_initializer='lecun_uniform',\n", " kernel_regularizer=keras.regularizers.L2(C)\n", " )(hidden3)\n", " hidden4 = keras.layers.Dropout(dropout_rate)(hidden4)\n", "\n", " # output layer\n", " output = keras.layers.Dense(\n", " dim_y, activation='linear',\n", " kernel_initializer='lecun_uniform',\n", " kernel_regularizer=keras.regularizers.L2(C)\n", " )(hidden4)\n", " \n", " # generate model\n", " model = keras.Model(inputs=input_tensor, outputs=[output])\n", " model.compile(loss=clipped_mae_loss, optimizer='adam')\n", " return model\n", "\n", "def fit_predict(\n", " X_train, y_train, median_n_weeks_train, y_visits_train,\n", " X_test, median_n_weeks_test,\n", " n_bagging, n_rounds, epochs, batch_size):\n", " \"\"\"\n", " Train a bag of DNN models on K-fold train splits. The test predictions are the medians of individual model predictions.\n", " \"\"\"\n", " model_bag = [build_one_dnn(X_train.shape[1], y_train.shape[1]) for _ in range(n_bagging)]\n", " kfold = KFold(n_splits=n_bagging)\n", "\n", " y_test_pred = np.zeros((n_bagging, *y_train.shape)) # y_test y_train same shape[1]\n", " visits_test_pred = np.zeros(y_train.shape) # y_test y_train same shape[1]\n", " min_valid_loss = float('inf')\n", " for i in range(n_rounds):\n", " print(f\"Round {i}\", end=' ') \n", " y_pred = np.zeros(y_train.shape)\n", " for k, (train_index, test_index) in tqdm(\n", " enumerate(kfold.split(X_train, y_train)), total=n_bagging):\n", " X_train_k, y_train_k = X_train[train_index, :], y_train[train_index, :]\n", " X_test_k = X_train[test_index, :]\n", " model = model_bag[k]\n", " history = model.fit(X_train_k, y_train_k,\n", " #validation_data=[[X_train_median_diff[test_index, :], X_train_cat[test_index, :]], y_train[test_index, :]],\n", " epochs=epochs, batch_size=batch_size, verbose=0, shuffle=True)\n", " y_pred[test_index, :] = model.predict(X_test_k, batch_size=batch_size)\n", " _ = model.predict(X_test)\n", " \n", " visits_pred = np.expm1(y_pred + median_n_weeks_train)\n", " visits_pred[visits_pred < 0.5] = 0\n", " valid_loss = smape(y_visits_train, visits_pred)\n", " print(f\"{valid_loss = :.5f}\")\n", "\n", " # update test prediction if validation improves\n", " if valid_loss < min_valid_loss:\n", " for k in range(n_bagging):\n", " y_test_pred[k, :, :] = model_bag[k].predict(X_test, batch_size=batch_size)\n", " visits_test_pred = np.expm1(np.nanmedian(y_test_pred, axis=0) + median_n_weeks_test)\n", " visits_test_pred[visits_test_pred < 0.5] = 0\n", " \n", " return visits_test_pred" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2022-01-01 22:53:52.269441: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n", "2022-01-01 22:53:52.269460: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Round 0 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [04:38<00:00, 13.90s/it]\n", "/tmp/ipykernel_85912/1704954077.py:7: RuntimeWarning: invalid value encountered in true_divide\n", " ape = np.abs(y_true - y_pred) / denom # absolute percentage error before averaging\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.48138\n", "Round 1 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [04:24<00:00, 13.23s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.47140\n", "Round 2 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:41<00:00, 8.08s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.45864\n", "Round 3 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:22<00:00, 7.14s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.45100\n", "Round 4 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:17<00:00, 6.88s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44859\n", "Round 5 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:16<00:00, 6.83s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44747\n", "Round 6 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [26:45<00:00, 80.27s/it] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44657\n", "Round 7 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [03:09<00:00, 9.49s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44559\n", "Round 8 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:22<00:00, 7.10s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44567\n", "Round 9 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:18<00:00, 6.93s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44559\n", "Round 10 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:16<00:00, 6.82s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44496\n", "Round 11 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:16<00:00, 6.83s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44496\n", "Round 12 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:26<00:00, 7.34s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44466\n", "Round 13 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:21<00:00, 7.10s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44460\n", "Round 14 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:23<00:00, 7.16s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44462\n", "Round 15 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:23<00:00, 7.16s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44425\n", "Round 16 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:23<00:00, 7.15s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44424\n", "Round 17 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:17<00:00, 6.90s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44412\n", "Round 18 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:17<00:00, 6.88s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44441\n", "Round 19 " ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [02:17<00:00, 6.88s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "valid_loss = 0.44431\n" ] } ], "source": [ "n_bagging = 20\n", "n_rounds = 20\n", "epochs=10\n", "batch_size = 4096\n", "\n", "visits_test_pred = fit_predict(\n", " X_train, y_train, X_medians_all_train, y_visits_train,\n", " X_test, X_medians_all_test,\n", " n_bagging, n_rounds=n_rounds, epochs=epochs, batch_size=batch_size)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# make submission\n", "test_key = pd.read_csv(test_key_csv)\n", "test_key_split = test_key['Page'].str.rsplit('_', n=1, expand=True)\n", "test_key_split.columns = ['Page', 'Date']\n", "test_key = pd.concat([test_key_split, test_key[['Id']]], axis=1)\n", "\n", "test_dates = sorted(test_key['Date'].unique())\n", "visits_test_pred_df = pd.DataFrame(visits_test_pred[:, 1:], columns=test_dates)\n", "visits_test_pred_df = pd.concat([visits_test_pred_df, meta[['Page']]], axis=1)\n", "visits_test_pred_df = pd.melt(visits_test_pred_df, id_vars=['Page'], var_name='Date', value_name='Visits')\n", "submission_df = visits_test_pred_df.merge(test_key, how='left', on=['Page', 'Date'])[['Id', 'Visits']]\n", "submission_df.to_csv(submission_csv, index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Score: 37.73212 (at 11th place)" ] } ], "metadata": { "interpreter": { "hash": "f6b0d3faf30ba3d9ead12cafe30eabbf1d71a53f9577af15dc3e1a12414ee7e5" }, "kernelspec": { "display_name": "Python 3.9.7 64-bit ('ml': conda)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }