In [1]:
import pandas as pd
import numpy as np
import xlrd
from plotly.graph_objs import Scatter, layout
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import cufflinks as cf
from urllib.request import urlopen
import json
import re
cf.go_offline()  # these two statements configure offline figure generation
cf.set_config_file(offline=True, world_readable=True)
plotly.offline.init_notebook_mode(connected=True)

In [2]:
# Load the price table and use its 'Unnamed: 0' column as the row index.
df = pd.read_csv('data.csv').set_index('Unnamed: 0')

# One single-column frame per asset, each exposing its series under the
# common column name "value".
df_bitcoin = df[['Value']].rename(columns={"Value": "value"})
df_gold = (
    df[['USD (PM)']]
    # The row labelled '2016-09-11' is excluded from the gold series
    # (reason not stated in this notebook -- TODO confirm).
    .drop(index='2016-09-11')
    .rename(columns={"USD (PM)": "value"})
)

In [3]:
# Use lagged observations as features:
# each row's "value" is paired with the n_lags observations that precede it.
def generate_time_lags(df, n_lags):
    """Return a copy of ``df`` extended with ``lag1`` .. ``lag{n_lags}`` columns.

    ``lag{i}`` holds the ``"value"`` column shifted down by ``i`` rows. The
    first ``n_lags`` rows, which cannot have a complete set of lags, are
    removed from the result. ``df`` itself is not modified.

    Args:
        df: DataFrame containing a ``"value"`` column.
        n_lags: Non-negative number of lag columns to create.

    Returns:
        A new DataFrame with the original columns followed by the lag columns.

    Raises:
        ValueError: If ``n_lags`` is negative.
        KeyError: If ``df`` has no ``"value"`` column.
    """
    if n_lags < 0:
        raise ValueError(f"n_lags must be non-negative, got {n_lags}")

    # Shift only the column we need instead of the whole (growing) frame.
    series = df["value"]
    lag_columns = {f"lag{i}": series.shift(i) for i in range(1, n_lags + 1)}

    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(**lag_columns).iloc[n_lags:]
# Number of lagged gold prices generated as features for every row.
# NOTE: `input_dim` is re-assigned further down (to the number of feature
# columns) before the model is built.
input_dim = 20
df_features = generate_time_lags(df_gold, input_dim)

In [4]:
# Add the day of the week as a feature.
# Index labels that cannot be parsed as dates become NaT (errors='coerce').
df_features = df_features.assign(
    time=pd.to_datetime(df_features.index, errors='coerce')
)
df_features = df_features.assign(day_of_week=df_features['time'].dt.dayofweek)

In [5]:
# One-hot encoding
def onehot_encode_pd(df, col_name):
    """Replace column ``col_name`` of ``df`` with its one-hot indicator columns.

    The indicator columns are named ``"{col_name}_{category}"`` and are placed
    after the remaining original columns. ``df`` itself is not modified.
    """
    indicator_cols = pd.get_dummies(df[col_name], prefix=col_name)
    remaining_cols = df.drop(columns=[col_name])
    return pd.concat([remaining_cols, indicator_cols], axis=1)
# Columns that are cast to categorical dtype and then one-hot encoded.
lst = ['day_of_week']
for name in lst:
    df_features = onehot_encode_pd(
        df_features.assign(**{name: df_features[name].astype('category')}),
        name,
    )

In [6]:
# Add a feature indicating whether the given day is a holiday.
from datetime import date
import holidays
# US holiday calendar; is_holiday() below tests membership against it.
US_holidays = holidays.US()
def is_holiday(date):
    """Return 1 if ``date`` is contained in the module-level ``US_holidays``, else 0.

    ``date`` is used as given; no time-of-day normalisation is applied.
    """
    if date in US_holidays:
        return 1
    return 0

def add_holiday_col(df, holidays):
    """Return a copy of ``df`` with an integer ``is_holiday`` column.

    Each index label of ``df`` is tested for membership in ``holidays``; the
    new column is 1 where the label is contained in it and 0 otherwise.

    Args:
        df: DataFrame whose index labels identify the days to check.
        holidays: Any container supporting ``label in holidays`` (for example
            the calendar object created with ``holidays.US()``). This argument
            is now actually used; previously it was ignored and a global
            calendar was consulted instead.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    flags = [1 if label in holidays else 0 for label in df.index]
    return df.assign(is_holiday=flags)

# Append the is_holiday flag column, based on the US holiday calendar.
df_features = add_holiday_col(df_features, US_holidays)

In [7]:
# The helper 'time' column was only needed to derive day_of_week; remove it.
df_features = df_features.drop(columns=['time'])

In [32]:
# Persist the engineered gold feature table (index included) to a CSV file.
df_features.to_csv('df_features_gold.csv')

In [21]:
from sklearn.model_selection import train_test_split

def feature_label_split(df, target_col):
    """Split ``df`` into a feature frame and a single-column target frame.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without ``target_col`` and ``y`` is a
        DataFrame (not a Series) containing only ``target_col``.
    """
    target = df[[target_col]]
    features = df.drop(columns=[target_col])
    return features, target

def train_val_test_split(df, target_col, test_ratio):
    """Split ``df`` into train / validation / test parts without shuffling.

    The test part takes ``test_ratio`` of all rows. The validation part is
    then carved out of the remainder with a rescaled ratio, so that it also
    amounts to ``test_ratio`` of the full data set.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Share of the non-test rows that equals `test_ratio` of the whole set.
    val_share = test_ratio / (1 - test_ratio)

    features, target = feature_label_split(df, target_col)

    X_rest, X_test, y_rest, y_test = train_test_split(
        features, target, test_size=test_ratio, shuffle=False
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_share, shuffle=False
    )
    return X_train, X_val, X_test, y_train, y_val, y_test
# Order-preserving split: test_ratio=0.2 gives roughly 60% train / 20% validation / 20% test.
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(df_features, 'value', 0.2)

In [22]:
# Scale the values; both scalers are fitted on the training split only.
from sklearn.preprocessing import MinMaxScaler

# Features and target get separate scalers so that neither fit overwrites
# the other (a single re-fitted scaler would lose the feature scaling).
feature_scaler = MinMaxScaler()
X_train_arr = feature_scaler.fit_transform(X_train)
X_val_arr = feature_scaler.transform(X_val)
X_test_arr = feature_scaler.transform(X_test)

target_scaler = MinMaxScaler()
y_train_arr = target_scaler.fit_transform(y_train)
y_val_arr = target_scaler.transform(y_val)
y_test_arr = target_scaler.transform(y_test)

# Backward-compatible alias: later cells pass `scaler` to undo the target
# scaling, which is the state the original single scaler ended up in.
scaler = target_scaler

In [23]:
import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 64

# Convert every scaled array into a torch tensor.
train_features, train_targets = torch.Tensor(X_train_arr), torch.Tensor(y_train_arr)
val_features, val_targets = torch.Tensor(X_val_arr), torch.Tensor(y_val_arr)
test_features, test_targets = torch.Tensor(X_test_arr), torch.Tensor(y_test_arr)

# Pair features with targets, one dataset per split.
train = TensorDataset(train_features, train_targets)
val = TensorDataset(val_features, val_targets)
test = TensorDataset(test_features, test_targets)

# Row order is kept (shuffle=False); an incomplete final batch is discarded
# (drop_last=True). `test_loader_one` yields one sample per batch.
train_loader = DataLoader(dataset=train, batch_size=batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(dataset=val, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(dataset=test, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader_one = DataLoader(dataset=test, batch_size=1, shuffle=False, drop_last=True)

In [24]:
from torch import nn
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Size of the output produced for each sequence.
        dropout_prob: Dropout probability between stacked LSTM layers. It is
            only passed to ``nn.LSTM`` when ``layer_dim > 1``, because dropout
            has no effect (and triggers a warning) for a single layer.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        super().__init__()

        # Number of layers and the nodes in each layer
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # LSTM layers; inputs are laid out as (batch, seq, feature)
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            layer_dim,
            batch_first=True,
            dropout=dropout_prob if layer_dim > 1 else 0.0,
        )

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to (batch, output_dim)."""
        # Fresh zero hidden/cell states for every call, created with the same
        # dtype and on the same device as the input, so no global `device`
        # is needed. Being new tensors, they carry no gradient history.
        state_shape = (self.layer_dim, x.size(0), self.hidden_dim)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (h0, c0))

        # Keep only the output of the last time step: (batch, hidden_dim)
        out = out[:, -1, :]

        # Project to the desired output shape (batch, output_dim)
        return self.fc(out)

In [25]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [28]:
import matplotlib.pyplot as plt
class Optimization:
    """Training, validation and evaluation helper around a torch model.

    Attributes:
        model: The ``nn.Module`` being optimised.
        loss_fn: Callable invoked as ``loss_fn(prediction, target)``.
        optimizer: Optimizer updating ``model``'s parameters.
        train_losses: Mean training loss per epoch.
        val_losses: Mean validation loss per epoch.
    """

    def __init__(self, model, loss_fn, optimizer):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_losses = []
        self.val_losses = []

    def _device(self):
        """Return the device of the model's parameters (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    @staticmethod
    def _mean_loss(losses):
        """Mean of ``losses``; NaN (without a numpy warning) when the list is empty."""
        return np.mean(losses) if losses else float("nan")

    def train_step(self, x, y):
        """Run one optimisation step on a batch and return its loss as a float."""
        # Sets model to train mode
        self.model.train()

        # Start from clean gradients
        self.optimizer.zero_grad()

        # Makes predictions
        yhat = self.model(x)

        # Computes loss in the conventional (prediction, target) order
        loss = self.loss_fn(yhat, y)

        # Computes gradients and updates parameters
        loss.backward()
        self.optimizer.step()

        # Returns the loss
        return loss.item()

    def train(self, train_loader, val_loader, batch_size=64, n_epochs=50, n_features=1):
        """Train for ``n_epochs`` epochs, recording train and validation losses.

        Args:
            train_loader: Iterable of ``(features, targets)`` training batches.
            val_loader: Iterable of ``(features, targets)`` validation batches.
            batch_size: Kept for backward compatibility; the batch dimension
                is read from each batch, so partial batches are supported.
            n_epochs: Number of passes over ``train_loader``.
            n_features: Size of the last dimension each batch is reshaped to.
        """
        device = self._device()

        for epoch in range(1, n_epochs + 1):
            batch_losses = []
            for x_batch, y_batch in train_loader:
                # Reshape to (batch, seq, n_features) using the real batch size
                x_batch = x_batch.view(x_batch.size(0), -1, n_features).to(device)
                y_batch = y_batch.to(device)
                batch_losses.append(self.train_step(x_batch, y_batch))
            training_loss = self._mean_loss(batch_losses)
            self.train_losses.append(training_loss)

            # Evaluation mode is set once per epoch, not once per batch
            self.model.eval()
            with torch.no_grad():
                batch_val_losses = []
                for x_val, y_val in val_loader:
                    x_val = x_val.view(x_val.size(0), -1, n_features).to(device)
                    y_val = y_val.to(device)
                    yhat = self.model(x_val)
                    batch_val_losses.append(self.loss_fn(yhat, y_val).item())
                validation_loss = self._mean_loss(batch_val_losses)
                self.val_losses.append(validation_loss)

            # Report the first ten epochs and every 50th epoch afterwards
            if epoch <= 10 or epoch % 50 == 0:
                print(
                    f"[{epoch}/{n_epochs}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}"
                )

    def evaluate(self, test_loader, batch_size=1, n_features=1):
        """Predict every batch of ``test_loader``.

        Args:
            test_loader: Iterable of ``(features, targets)`` batches.
            batch_size: Kept for backward compatibility; the batch dimension
                is read from each batch.
            n_features: Size of the last dimension each batch is reshaped to.

        Returns:
            ``(predictions, values)``: two lists with one numpy array per
            batch, holding the model outputs and the targets respectively.
        """
        device = self._device()
        predictions = []
        values = []

        self.model.eval()
        with torch.no_grad():
            for x_test, y_test in test_loader:
                x_test = x_test.view(x_test.size(0), -1, n_features).to(device)
                y_test = y_test.to(device)
                yhat = self.model(x_test)
                predictions.append(yhat.cpu().detach().numpy())
                values.append(y_test.cpu().detach().numpy())

        return predictions, values

    def plot_losses(self):
        """Plot the recorded training and validation loss curves."""
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")
        plt.show()
        plt.close()

In [29]:
def get_model(model, model_params):
    """Instantiate a registered model class by (case-insensitive) name.

    Args:
        model: Name of the model; currently only ``"lstm"`` is registered.
        model_params: Keyword arguments forwarded to the model constructor.

    Returns:
        A new instance of the selected model class.

    Raises:
        ValueError: If ``model`` is not a registered name. Previously this
            surfaced as an opaque "'NoneType' object is not callable" error.
    """
    models = {
        "lstm": LSTMModel,
    }
    key = model.lower()
    if key not in models:
        raise ValueError(
            f"Unknown model {model!r}; available models: {sorted(models)}"
        )
    return models[key](**model_params)

In [None]:
import torch.optim as optim

# Fix the torch RNG seed so that weight initialisation and dropout are
# reproducible from one "Restart & Run All" to the next.
torch.manual_seed(42)

# Model and training hyper-parameters.
input_dim = len(X_train.columns)  # one input per feature column
output_dim = 1
hidden_dim = 64
layer_dim = 3
batch_size = 64
dropout = 0.2
n_epochs = 100
learning_rate = 1e-3
weight_decay = 1e-6

model_params = {'input_dim': input_dim,
                'hidden_dim' : hidden_dim,
                'layer_dim' : layer_dim,
                'output_dim' : output_dim,
                'dropout_prob' : dropout}

model = get_model('lstm', model_params).to(device)

loss_fn = nn.MSELoss(reduction="mean")
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Each row is fed to the LSTM as a sequence whose last dimension is input_dim.
opt = Optimization(model=model, loss_fn=loss_fn, optimizer=optimizer)
opt.train(train_loader, val_loader, batch_size=batch_size, n_epochs=n_epochs, n_features=input_dim)
opt.plot_losses()

In [None]:
# Timing note: 41m 59s (presumably the recorded run time of a previous cell -- TODO confirm).
# Predict the test set with the one-sample-per-batch loader.
predictions, values = opt.evaluate(test_loader_one, batch_size=1, n_features=input_dim)

In [None]:
# Restore scaled values to their real (original) scale.
def inverse_transform(scaler, df, columns):
    """Apply ``scaler.inverse_transform`` to the given columns of ``df`` in place.

    Args:
        scaler: Object exposing ``inverse_transform`` for 2-D input.
        df: DataFrame whose columns are overwritten with the restored values.
        columns: Iterable whose entries are either a single column label or a
            list of labels. Every entry is passed to the scaler as one 2-D
            block, so a plain label such as ``"value"`` now works as well as
            a group such as ``["value", "prediction"]``.

    Returns:
        The same ``df`` object, modified in place.
    """
    for col in columns:
        # Always select with a list so the scaler receives 2-D data.
        group = list(col) if pd.api.types.is_list_like(col) else [col]
        df[group] = scaler.inverse_transform(df[group])
    return df


def format_predictions(predictions, values, df_test, scaler):
    """Combine per-batch targets and predictions into one unscaled DataFrame.

    Args:
        predictions: List of per-batch prediction arrays.
        values: List of per-batch target arrays.
        df_test: Frame whose leading index labels are used for the result.
        scaler: Scaler used to undo the scaling of both columns.

    Returns:
        DataFrame with columns ``"value"`` and ``"prediction"``, sorted by
        index and passed through ``inverse_transform``.
    """
    actual = np.concatenate(values, axis=0).ravel()
    predicted = np.concatenate(predictions, axis=0).ravel()

    # Reuse the first len(actual) index labels of the test frame.
    row_labels = df_test.head(len(actual)).index
    frame = pd.DataFrame(
        data={"value": actual, "prediction": predicted}, index=row_labels
    ).sort_index()

    return inverse_transform(scaler, frame, [["value", "prediction"]])


# Actual vs. predicted test values on the original scale, labelled with X_test's index.
df_result = format_predictions(predictions, values, X_test, scaler)

In [None]:
# Compute the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def calculate_metrics(df):
    """Return MAE, RMSE and R^2 between ``df``'s ``value`` and ``prediction`` columns.

    Returns:
        Dict with the keys ``'mae'``, ``'rmse'`` and ``'r2'``.
    """
    actual, predicted = df.value, df.prediction
    mae = mean_absolute_error(actual, predicted)
    rmse = mean_squared_error(actual, predicted) ** 0.5
    r2 = r2_score(actual, predicted)
    return {'mae': mae, 'rmse': rmse, 'r2': r2}

# Error metrics of the LSTM predictions on the test set.
result_metrics = calculate_metrics(df_result)

In [None]:
# LinearRegression is used as a baseline for comparison
from sklearn.linear_model import LinearRegression

def build_baseline_model(df, test_ratio, target_col):
    """Fit a linear-regression baseline and return its test-set predictions.

    The data is split without shuffling; the model is fitted on the first
    part and evaluated on the trailing ``test_ratio`` share of the rows.

    Args:
        df: Feature table including the target column.
        test_ratio: Share of rows held out for testing.
        target_col: Name of the target column.

    Returns:
        DataFrame sorted by index that contains the test targets (column
        ``target_col``) and a ``"prediction"`` column.
    """
    X, y = feature_label_split(df, target_col)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, shuffle=False
    )
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Work on an explicit copy so adding a column cannot touch y_test, and
    # flatten the prediction, which is 2-D because y is a one-column frame.
    result = y_test.copy()
    result["prediction"] = np.ravel(model.predict(X_test))

    return result.sort_index()

# Baseline predictions on the same 20% test share, and their error metrics.
df_baseline = build_baseline_model(df_features, 0.2, 'value')
baseline_metrics = calculate_metrics(df_baseline)

In [None]:
# Visualize the results
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.offline as pyo

def plot_predictions(df_result, df_baseline):
    """Show actual values, baseline predictions and model predictions in one figure.

    Args:
        df_result: Frame with ``value`` and ``prediction`` columns.
        df_baseline: Frame with a ``prediction`` column from the baseline.
    """

    def _dotted_prediction_trace(frame, label):
        # Dotted line of frame.prediction over the frame's index.
        return go.Scatter(
            x=frame.index,
            y=frame.prediction,
            mode="lines",
            line={"dash": "dot"},
            name=label,
            marker=dict(),
            text=frame.index,
            opacity=0.8,
        )

    # Actual values are drawn as a translucent solid line.
    actual_trace = go.Scatter(
        x=df_result.index,
        y=df_result.value,
        mode="lines",
        name="values",
        marker=dict(),
        text=df_result.index,
        line=dict(color="rgba(0,0,0, 0.3)"),
    )

    traces = [
        actual_trace,
        _dotted_prediction_trace(df_baseline, 'linear regression'),
        _dotted_prediction_trace(df_result, 'predictions'),
    ]

    figure_layout = dict(
        title="Predictions vs Actual Values for the dataset",
        xaxis=dict(title="Time", ticklen=5, zeroline=False),
        yaxis=dict(title="Value", ticklen=5, zeroline=False),
    )
    go.Figure(data=traces, layout=figure_layout).show()
 
 
# Initialise plotly's offline notebook mode.
pyo.init_notebook_mode()

# Compare the model and baseline predictions against the actual values.
plot_predictions(df_result, df_baseline)