In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go

# Data preparation

## Reading data

In [2]:
# Load the raw table: one row per page, one column per day.
df_raw = pd.read_csv(filepath_or_buffer="train_2.csv")

In [3]:
# Preview the first rows of the raw table.
df_raw.head()

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2017-09-01,2017-09-02,2017-09-03,2017-09-04,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,19.0,33.0,33.0,18.0,16.0,27.0,29.0,23.0,54.0,38.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,32.0,30.0,11.0,19.0,54.0,25.0,26.0,23.0,13.0,81.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,6.0,6.0,7.0,2.0,4.0,7.0,3.0,4.0,7.0,6.0
3,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,...,7.0,19.0,19.0,9.0,6.0,16.0,19.0,30.0,38.0,4.0
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_s...,,,,,,,,,,...,16.0,16.0,19.0,9.0,20.0,23.0,28.0,14.0,8.0,7.0


In [4]:
# (number of rows, number of columns) of the raw table.
df_raw.shape

(145063, 804)

## Cleaning data

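We will use the data of a single page only: pages with missing values are dropped first, then one page is picked at random among those with a mean above 100 and a minimum above 10.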

In [5]:
# Keep only the rows (pages) that have a value in every column.
df_no_na = df_raw.dropna(axis=0, how="any")
df_no_na.head()

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,...,2017-09-01,2017-09-02,2017-09-03,2017-09-04,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,...,19.0,33.0,33.0,18.0,16.0,27.0,29.0,23.0,54.0,38.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,...,32.0,30.0,11.0,19.0,54.0,25.0,26.0,23.0,13.0,81.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,...,6.0,6.0,7.0,2.0,4.0,7.0,3.0,4.0,7.0,6.0
3,4minute_zh.wikipedia.org_all-access_spider,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,...,7.0,19.0,19.0,9.0,6.0,16.0,19.0,30.0,38.0,4.0
5,5566_zh.wikipedia.org_all-access_spider,12.0,7.0,4.0,5.0,20.0,8.0,5.0,17.0,24.0,...,13.0,13.0,45.0,4.0,13.0,20.0,18.0,17.0,14.0,11.0


In [6]:
# Fixed seed so that "Restart & Run All" always selects the same page; without
# it every run trains and evaluates on a different series.
PAGE_SAMPLE_SEED = 42

# Per-page statistics over every column except "Page".
daily_values = df_no_na.drop("Page", axis=1)
means = daily_values.mean(axis=1)
mins = daily_values.min(axis=1)

# Candidate pages: mean above 100 and minimum above 10.
candidate_pages = df_no_na.loc[(means > 100) & (mins > 10), "Page"]
one_page = candidate_pages.sample(1, random_state=PAGE_SAMPLE_SEED).iloc[0]

# Frame restricted to the selected page, without its "Page" label column.
df_one_page = df_no_na[df_no_na["Page"] == one_page].drop("Page", axis=1)
df_one_page.head()

Unnamed: 0,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,2015-07-10,...,2017-09-01,2017-09-02,2017-09-03,2017-09-04,2017-09-05,2017-09-06,2017-09-07,2017-09-08,2017-09-09,2017-09-10
71123,1348.0,1448.0,881.0,547.0,658.0,951.0,1228.0,1157.0,1162.0,866.0,...,2155.0,1274.0,1715.0,2758.0,3151.0,2991.0,2637.0,1527.0,931.0,1146.0


## Windows

In [7]:
window_size = 15
cols = df_one_page.columns
# First row of the selected page as a flat array, extracted once.
page_values = df_one_page[cols].values[0]
# One window per possible start position; each holds `window_size` consecutive values.
window_data = [
    page_values[start_index:start_index + window_size]
    for start_index in range(len(cols) - window_size + 1)
]

In [8]:
# Number of windows that were built.
len(window_data)

789

## Split between train and test sets

In [9]:
# Ordered split: the first 85% of the windows go to training, the rest to testing.
test_start_index = int(len(window_data) * 0.85)
train_data, test_data = (
    np.array(chunk)
    for chunk in (window_data[:test_start_index], window_data[test_start_index:])
)

In [10]:
# Inputs: every value of a window except the last one, with a trailing axis of
# size 1 appended; target: the last value of the window.
x_train = np.expand_dims(train_data[:, :-1], axis=-1)
y_train = train_data[:, -1]
x_test = np.expand_dims(test_data[:, :-1], axis=-1)
y_test = test_data[:, -1]

In [11]:
# Display the shapes of the train/test inputs and targets.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((670, 14, 1), (670,), (119, 14, 1), (119,))

# Apply LSTM

## LSTM without global scaling

In [12]:
def get_model(x_train):
    """Build and compile a stacked-LSTM regression model.

    The network chains an LSTM(128) that returns full sequences, an LSTM(32),
    a Dense(8) layer and a single-unit Dense output. It is compiled with the
    RMSprop optimizer and the "mae" loss.

    Args:
        x_train: Array whose ``shape[1]`` and ``shape[2]`` define the input
            shape of the model. Only its shape is read.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    layers = tf.keras.layers
    input_shape = (x_train.shape[1], x_train.shape[2])

    inputs = layers.Input(shape=input_shape)
    hidden = layers.LSTM(128, return_sequences=True)(inputs)
    hidden = layers.LSTM(32)(hidden)
    hidden = layers.Dense(8)(hidden)
    outputs = layers.Dense(1)(hidden)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss="mae", optimizer=tf.keras.optimizers.RMSprop())
    return model

In [13]:
model = get_model(x_train)
# Stop once the validation loss has not improved for 100 epochs and restore
# the best weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                  patience=100,
                                                  restore_best_weights=True)
# `callbacks` is documented as a list of callbacks, so wrap the single callback.
model_history = model.fit(x_train, y_train, validation_split=0.1,
                          batch_size=128, epochs=1000,
                          callbacks=[early_stopping], verbose=0)

In [14]:
def visualize_loss(history, title):
    """Plot the training and validation loss curves of a fit history.

    Args:
        history: Object whose ``history`` mapping holds the ``"loss"`` and
            ``"val_loss"`` series.
        title: Title of the figure.
    """
    curves = history.history
    training_loss, validation_loss = curves["loss"], curves["val_loss"]
    # Both traces share one x axis, sized from the training loss.
    epoch_axis = list(range(len(training_loss)))

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=epoch_axis, y=training_loss, name="Training loss"))
    fig.add_trace(go.Scatter(x=epoch_axis, y=validation_loss, name="Validation loss"))
    fig.update_layout(title=title, xaxis_title="Epoch", yaxis_title="Loss")
    fig.show()

In [15]:
# Loss curves of the model trained on the raw (unscaled) values.
visualize_loss(history=model_history,
               title="Training of a LSTM model without scaling");

The training seems to be quite successful, but we will see at the end of the notebook that this is not really the case: the model ends up predicting the same value for every test sample.

In [16]:
preds_test = model.predict(x_test)
# scikit-learn's signature is (y_true, y_pred): the ground truth goes first.
mean_absolute_error(y_test, preds_test[:, 0])

614.4952618254333

## LSTM with scaling

### Scale features

In [17]:
# Global statistics computed over all training inputs (test data is not used).
train_mean = x_train.mean()
train_std = x_train.std()
(train_mean, train_std)

(1625.2317697228145, 1184.967031656248)

In [18]:
# Standardize every array with the same training statistics.
x_train_scaled, y_train_scaled, x_test_scaled, y_test_scaled = (
    (values - train_mean) / train_std
    for values in (x_train, y_train, x_test, y_test)
)

### Train models

In [19]:
# Build the model from the array it is actually trained on (same shape as x_train).
model_scaled = get_model(x_train_scaled)
# Stop once the validation loss has not improved for 100 epochs and restore
# the best weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                  patience=100,
                                                  restore_best_weights=True)

# `callbacks` is documented as a list of callbacks, so wrap the single callback.
scaled_model_history = model_scaled.fit(x_train_scaled, y_train_scaled,
                                        validation_split=0.1,
                                        batch_size=128, epochs=1000,
                                        callbacks=[early_stopping], verbose=0)

In [20]:
# Loss curves of the model trained on the standardized values.
visualize_loss(history=scaled_model_history,
               title="Training of a LSTM model with scaling");

In [21]:
preds_scaled_test = model_scaled.predict(x_test_scaled)
# Map the predictions back to the original scale before scoring them.
unscaled_preds_scaled_test = (preds_scaled_test[:, 0] * train_std) + train_mean
# scikit-learn's signature is (y_true, y_pred): the ground truth goes first.
mean_absolute_error(y_test, unscaled_preds_scaled_test)

143.87561548056723

# Visualize results

## Visualize global predictions vs reality

In [22]:
# One frame per model, pairing each test prediction with the observed value.
preds_without_scaling_vs_real = pd.DataFrame(
    {"predictions": preds_test[:, 0], "reality": y_test}
)
preds_with_scaling_vs_real = pd.DataFrame(
    {"predictions": unscaled_preds_scaled_test, "reality": y_test}
)

# Stack both frames, tagging every row with the model it came from.
preds_vs_real = pd.concat([
    frame.assign(model=label)
    for label, frame in (
        ("model_without_scaling", preds_without_scaling_vs_real),
        ("model_with_scaling", preds_with_scaling_vs_real),
    )
])
preds_vs_real.head()

Unnamed: 0,predictions,reality,model
0,1307.753052,1592.0,model_without_scaling
1,1307.753052,1570.0,model_without_scaling
2,1307.753052,1732.0,model_without_scaling
3,1307.753052,1445.0,model_without_scaling
4,1307.753052,1180.0,model_without_scaling


In [23]:
# Predictions against observed values, one colour and one OLS trendline per model.
px.scatter(data_frame=preds_vs_real,
           x="predictions",
           y="reality",
           color="model",
           trendline="ols",
           title="Comparaison des prédictions des deux modèles")

The model without scaling fell into a local minimum, corresponding to constant predictions.

## Visualize predictions vs reality as time series

In [24]:
# Column labels of the predicted days: the last len(preds_test) columns of the page.
days = df_one_page.columns[-len(preds_test):]

# (legend label, series) pairs, all drawn against the same days.
fig = go.Figure(data=[
    go.Scatter(x=days, y=series, name=label)
    for label, series in (
        ("Number of clicks", y_test),
        ("Predictions with model not using scaling", preds_test[:, 0]),
        ("Predictions with model using global scaling", unscaled_preds_scaled_test),
    )
])
fig.update_layout(title="Predictions vs reality on test set",
                  xaxis_title="Day",
                  yaxis_title="Clicks")