{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import xlrd\n",
"from plotly.graph_objs import Scatter, layout\n",
"import plotly\n",
"import plotly.offline as py\n",
"import plotly.graph_objs as go\n",
"import cufflinks as cf\n",
"from urllib.request import urlopen\n",
"import json\n",
"import re\n",
"cf.go_offline()  # these two calls configure cufflinks to generate figures offline\n",
"cf.set_config_file(offline=True, world_readable=True)\n",
"plotly.offline.init_notebook_mode(connected=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the merged price table; the unnamed first CSV column holds the date labels.\n",
"df = pd.read_csv('data.csv').set_index('Unnamed: 0')\n",
"\n",
"# Build each single-column frame from a fresh selection instead of mutating a\n",
"# column slice of df in place. The price column is renamed to 'value' in both.\n",
"df_bitcoin = df[['Value']].rename(columns={\"Value\": \"value\"})\n",
"df_gold = (\n",
"    df[['USD (PM)']]\n",
"    .drop(index='2016-09-11')  # this row is removed from the gold series only\n",
"    .rename(columns={\"USD (PM)\": \"value\"})\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# using lagged observations as features\n",
"# 对于X(t+n),就有X(t+1), X(t+2), ..., X(t+n-1)作为特征\n",
"def generate_time_lags(df, n_lags):\n",
" df_n = df.copy()\n",
" for i in range(1, n_lags + 1):\n",
" df_n[f\"lag{i}\"] = df_n.shift(i)['value']\n",
" df_n = df_n.iloc[n_lags:]\n",
" return df_n\n",
"input_dim = 20  # number of lagged observations generated per row\n",
"df_features = generate_time_lags(df=df_gold, n_lags=input_dim)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Add the day of the week as a feature.\n",
"# assign() builds a new frame instead of writing a column into the existing one,\n",
"# which is a slice produced by generate_time_lags.\n",
"df_features = (\n",
"    df_features\n",
"    .assign(time=pd.to_datetime(df_features.index, errors='coerce'))\n",
"    .assign(day_of_week=lambda frame: frame['time'].dt.dayofweek)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# one-hot处理\n",
"def onehot_encode_pd(df, col_name):\n",
" dummies = pd.get_dummies(df[col_name], prefix=col_name)\n",
" return pd.concat([df, dummies], axis=1).drop(columns=[col_name])\n",
"lst = ['day_of_week']\n",
"for name in lst:\n",
"    # Cast the column to category dtype before one-hot encoding it.\n",
"    as_category = df_features[name].astype('category')\n",
"    df_features = onehot_encode_pd(df_features.assign(**{name: as_category}), name)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Add a feature flagging whether the day is a holiday\n",
"from datetime import date\n",
"import holidays\n",
"US_holidays = holidays.US()\n",
"def is_holiday(date):\n",
"    \"\"\"Return 1 when ``date`` is found in ``US_holidays``, otherwise 0.\"\"\"\n",
"    return int(date in US_holidays)\n",
"\n",
"def add_holiday_col(df, holidays):\n",
" return df.assign(is_holiday = df.index.to_series().apply(is_holiday))\n",
"\n",
"df_features = add_holiday_col(df_features, US_holidays)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df_features = df_features.drop(columns='time')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"df_features.to_csv('df_features_gold.csv')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"def feature_label_split(df, target_col):\n",
" y = df[[target_col]]\n",
" X = df.drop(columns=[target_col])\n",
" return X, y\n",
"\n",
"def train_val_test_split(df, target_col, test_ratio):\n",
"    \"\"\"Split ``df`` into train / validation / test features and labels without shuffling.\n",
"\n",
"    The test set is cut from the end of the rows using ``test_ratio``. The\n",
"    validation set is then cut from the end of the remainder using\n",
"    ``test_ratio / (1 - test_ratio)``.\n",
"\n",
"    Returns:\n",
"        ``(X_train, X_val, X_test, y_train, y_val, y_test)``\n",
"    \"\"\"\n",
"    # Share of the non-test rows that goes to the validation set.\n",
"    val_share = test_ratio / (1 - test_ratio)\n",
"    features, labels = feature_label_split(df, target_col)\n",
"    X_rest, X_test, y_rest, y_test = train_test_split(\n",
"        features, labels, test_size=test_ratio, shuffle=False\n",
"    )\n",
"    X_train, X_val, y_train, y_val = train_test_split(\n",
"        X_rest, y_rest, test_size=val_share, shuffle=False\n",
"    )\n",
"    return X_train, X_val, X_test, y_train, y_val, y_test\n",
"X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(df_features, 'value', 0.2)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Scale features and target with MinMaxScaler; each scaler is fitted on the training split only.\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# A dedicated feature scaler, so fitting the target below does not overwrite its fit.\n",
"feature_scaler = MinMaxScaler()\n",
"X_train_arr = feature_scaler.fit_transform(X_train)\n",
"X_val_arr = feature_scaler.transform(X_val)\n",
"X_test_arr = feature_scaler.transform(X_test)\n",
"\n",
"# `scaler` remains the target scaler: later cells pass it to format_predictions\n",
"# to map scaled predictions back to the original value range.\n",
"scaler = MinMaxScaler()\n",
"y_train_arr = scaler.fit_transform(y_train)\n",
"y_val_arr = scaler.transform(y_val)\n",
"y_test_arr = scaler.transform(y_test)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch.utils.data import TensorDataset, DataLoader\n",
"\n",
"# Wrap the scaled arrays as tensors, one (features, targets) pair per split.\n",
"train_features, train_targets = torch.Tensor(X_train_arr), torch.Tensor(y_train_arr)\n",
"val_features, val_targets = torch.Tensor(X_val_arr), torch.Tensor(y_val_arr)\n",
"test_features, test_targets = torch.Tensor(X_test_arr), torch.Tensor(y_test_arr)\n",
"\n",
"batch_size = 64\n",
"\n",
"train = TensorDataset(train_features, train_targets)\n",
"val = TensorDataset(val_features, val_targets)\n",
"test = TensorDataset(test_features, test_targets)\n",
"\n",
"# Loaders keep the row order (no shuffling) and drop a trailing partial batch.\n",
"train_loader = DataLoader(\n",
"    train, batch_size=batch_size, shuffle=False, drop_last=True\n",
")\n",
"val_loader = DataLoader(\n",
"    val, batch_size=batch_size, shuffle=False, drop_last=True\n",
")\n",
"test_loader = DataLoader(\n",
"    test, batch_size=batch_size, shuffle=False, drop_last=True\n",
")\n",
"# One sample per batch, used for point-by-point evaluation.\n",
"test_loader_one = DataLoader(\n",
"    test, batch_size=1, shuffle=False, drop_last=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from torch import nn\n",
"class LSTMModel(nn.Module):\n",
" def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):\n",
" super(LSTMModel, self).__init__()\n",
"\n",
" # Defining the number of layers and the nodes in each layer\n",
" self.hidden_dim = hidden_dim\n",
" self.layer_dim = layer_dim\n",
"\n",
" # LSTM layers\n",
" self.lstm = nn.LSTM(\n",
" input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob\n",
" )\n",
"\n",
" # Fully connected layer\n",
" self.fc = nn.Linear(hidden_dim, output_dim)\n",
"\n",
" def forward(self, x):\n",
" # Initializing hidden state for first input with zeros\n",
" h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)\n",
"\n",
" # Initializing cell state for first input with zeros\n",
" c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)\n",
"\n",
" # We need to detach as we are doing truncated backpropagation through time (BPTT)\n",
" # If we don't, we'll backprop all the way to the start even after going through another batch\n",
" # Forward propagation by passing in the input, hidden state, and cell state into the model\n",
" out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))\n",
"\n",
" # Reshaping the outputs in the shape of (batch_size, seq_length, hidden_size)\n",
" # so that it can fit into the fully connected layer\n",
" out = out[:, -1, :]\n",
"\n",
" # Convert the final state to our desired output shape (batch_size, output_dim)\n",
" out = self.fc(out)\n",
"\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"class Optimization:\n",
" def __init__(self, model, loss_fn, optimizer):\n",
" self.model = model\n",
" self.loss_fn = loss_fn\n",
" self.optimizer = optimizer\n",
" self.train_losses = []\n",
" self.val_losses = []\n",
" \n",
" def train_step(self, x, y):\n",
" # Sets model to train mode\n",
" self.model.train()\n",
"\n",
" # Makes predictions\n",
" yhat = self.model(x).to(device)\n",
"\n",
" # Computes loss\n",
" loss = self.loss_fn(y, yhat)\n",
"\n",
" # Computes gradients\n",
" loss.backward()\n",
"\n",
" # Updates parameters and zeroes gradients\n",
" self.optimizer.step()\n",
" self.optimizer.zero_grad()\n",
"\n",
" # Returns the loss\n",
" return loss.item()\n",
"\n",
" def train(self, train_loader, val_loader, batch_size=64, n_epochs=50, n_features=1):\n",
" # model_path = f'models/{self.model}_{datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")}'\n",
"\n",
" for epoch in range(1, n_epochs + 1):\n",
" batch_losses = []\n",
" for x_batch, y_batch in train_loader:\n",
" x_batch = x_batch.view([batch_size, -1, n_features]).to(device)\n",
" y_batch = y_batch.to(device)\n",
" loss = self.train_step(x_batch, y_batch)\n",
" batch_losses.append(loss)\n",
" training_loss = np.mean(batch_losses)\n",
" self.train_losses.append(training_loss)\n",
"\n",
" with torch.no_grad():\n",
" batch_val_losses = []\n",
" for x_val, y_val in val_loader:\n",
" x_val = x_val.view([batch_size, -1, n_features]).to(device)\n",
" y_val = y_val.to(device)\n",
" self.model.eval()\n",
" yhat = self.model(x_val)\n",
" val_loss = self.loss_fn(y_val, yhat).item()\n",
" batch_val_losses.append(val_loss)\n",
" validation_loss = np.mean(batch_val_losses)\n",
" self.val_losses.append(validation_loss)\n",
"\n",
" if (epoch <= 10) | (epoch % 50 == 0):\n",
" print(\n",
" f\"[{epoch}/{n_epochs}] Training loss: {training_loss:.4f}\\t Validation loss: {validation_loss:.4f}\"\n",
" )\n",
"\n",
" # torch.save(self.model.state_dict(), model_path)\n",
" def evaluate(self, test_loader, batch_size=1, n_features=1):\n",
" with torch.no_grad():\n",
" predictions = []\n",
" values = []\n",
" for x_test, y_test in test_loader:\n",
" x_test = x_test.view([batch_size, -1, n_features]).to(device)\n",
" y_test = y_test.to(device)\n",
" self.model.eval()\n",
" yhat = self.model(x_test)\n",
" predictions.append(yhat.cpu().detach().numpy())\n",
" values.append(y_test.cpu().detach().numpy())\n",
"\n",
" return predictions, values\n",
"\n",
" def plot_losses(self):\n",
" plt.plot(self.train_losses, label=\"Training loss\")\n",
" plt.plot(self.val_losses, label=\"Validation loss\")\n",
" plt.legend()\n",
" plt.title(\"Losses\")\n",
" plt.show()\n",
" plt.close()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"def get_model(model, model_params):\n",
"    \"\"\"Instantiate the model class registered under ``model`` (case-insensitive).\n",
"\n",
"    Args:\n",
"        model: registry key, e.g. ``'lstm'``.\n",
"        model_params: keyword arguments forwarded to the model constructor.\n",
"\n",
"    Raises:\n",
"        ValueError: if ``model`` is not a registered name.\n",
"    \"\"\"\n",
"    models = {\n",
"        \"lstm\": LSTMModel,\n",
"    }\n",
"    key = model.lower()\n",
"    if key not in models:\n",
"        # Fail with a readable message instead of calling None.\n",
"        raise ValueError(f\"Unknown model '{model}'; expected one of {sorted(models)}\")\n",
"    return models[key](**model_params)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch.optim as optim\n",
"\n",
"# Fix the PyTorch seed so weight initialisation and dropout are repeatable across runs.\n",
"seed = 42\n",
"torch.manual_seed(seed)\n",
"\n",
"# Model and training hyper-parameters\n",
"input_dim = len(X_train.columns)  # one model input per feature column\n",
"output_dim = 1\n",
"hidden_dim = 64\n",
"layer_dim = 3\n",
"batch_size = 64\n",
"dropout = 0.2\n",
"n_epochs = 100\n",
"learning_rate = 1e-3\n",
"weight_decay = 1e-6\n",
"\n",
"model_params = {'input_dim': input_dim,\n",
"                'hidden_dim' : hidden_dim,\n",
"                'layer_dim' : layer_dim,\n",
"                'output_dim' : output_dim,\n",
"                'dropout_prob' : dropout}\n",
"\n",
"model = get_model('lstm', model_params).to(device)\n",
"\n",
"loss_fn = nn.MSELoss(reduction=\"mean\")\n",
"optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)\n",
"\n",
"# Each row is fed as a length-1 sequence whose step holds all input_dim features.\n",
"opt = Optimization(model=model, loss_fn=loss_fn, optimizer=optimizer)\n",
"opt.train(train_loader, val_loader, batch_size=batch_size, n_epochs=n_epochs, n_features=input_dim)\n",
"opt.plot_losses()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 41m 59s\n",
"predictions, values = opt.evaluate(test_loader_one, batch_size=1, n_features=input_dim)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 还原成真实值\n",
"def inverse_transform(scaler, df, columns):\n",
" for col in columns:\n",
" df[col] = scaler.inverse_transform(df[col])\n",
" return df\n",
"\n",
"\n",
"def format_predictions(predictions, values, df_test, scaler):\n",
"    \"\"\"Collect batched outputs into one frame indexed like ``df_test`` and undo the scaling.\n",
"\n",
"    Args:\n",
"        predictions: list of per-batch prediction arrays.\n",
"        values: list of per-batch target arrays.\n",
"        df_test: frame whose leading index labels are used for the result.\n",
"        scaler: scaler passed on to ``inverse_transform``.\n",
"\n",
"    Returns:\n",
"        Frame with ``value`` and ``prediction`` columns, sorted by index.\n",
"    \"\"\"\n",
"    actual = np.concatenate(values, axis=0).ravel()\n",
"    predicted = np.concatenate(predictions, axis=0).ravel()\n",
"    # Only as many index labels as there are collected values.\n",
"    labels = df_test.head(len(actual)).index\n",
"    frame = pd.DataFrame(data={\"value\": actual, \"prediction\": predicted}, index=labels)\n",
"    frame = frame.sort_index()\n",
"    return inverse_transform(scaler, frame, [[\"value\", \"prediction\"]])\n",
"\n",
"\n",
"df_result = format_predictions(predictions, values, X_test, scaler)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compute error metrics\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"def calculate_metrics(df):\n",
"    \"\"\"Return MAE, RMSE and R^2 of ``df.prediction`` against ``df.value`` as a dict.\"\"\"\n",
"    actual, predicted = df.value, df.prediction\n",
"    mae = mean_absolute_error(actual, predicted)\n",
"    rmse = mean_squared_error(actual, predicted) ** 0.5\n",
"    r2 = r2_score(actual, predicted)\n",
"    return {'mae': mae, 'rmse': rmse, 'r2': r2}\n",
"\n",
"result_metrics = calculate_metrics(df_result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# LinearRegression used as a baseline for comparison\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"def build_baseline_model(df, test_ratio, target_col):\n",
"    \"\"\"Fit a LinearRegression baseline on an unshuffled split and predict the test part.\n",
"\n",
"    Args:\n",
"        df: feature frame that includes ``target_col``.\n",
"        test_ratio: share of rows, taken from the end, used as the test set.\n",
"        target_col: name of the label column.\n",
"\n",
"    Returns:\n",
"        Frame with the true ``target_col`` values and a ``prediction`` column\n",
"        for the test rows, sorted by index.\n",
"    \"\"\"\n",
"    X, y = feature_label_split(df, target_col)\n",
"    X_train, X_test, y_train, y_test = train_test_split(\n",
"        X, y, test_size=test_ratio, shuffle=False\n",
"    )\n",
"    model = LinearRegression()\n",
"    model.fit(X_train, y_train)\n",
"    # The labels are a one-column frame, so predict() yields shape (n, 1);\n",
"    # flatten it before storing it as a single column.\n",
"    prediction = np.ravel(model.predict(X_test))\n",
"\n",
"    # Work on a copy so the new column is not written into the split's slice.\n",
"    result = y_test.copy()\n",
"    result[\"prediction\"] = prediction\n",
"    return result.sort_index()\n",
"\n",
"df_baseline = build_baseline_model(df_features, 0.2, 'value')\n",
"baseline_metrics = calculate_metrics(df_baseline)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Visualize the results\n",
"import plotly.graph_objs as go\n",
"from plotly.offline import iplot\n",
"import plotly.offline as pyo\n",
"\n",
"def plot_predictions(df_result, df_baseline):\n",
"    \"\"\"Plot actual values, the linear-regression baseline and the model predictions.\n",
"\n",
"    Args:\n",
"        df_result: frame with ``value`` and ``prediction`` columns.\n",
"        df_baseline: frame with a ``prediction`` column from the baseline model.\n",
"    \"\"\"\n",
"    def dotted_trace(frame, label):\n",
"        # Dotted, slightly transparent line for a frame's `prediction` column.\n",
"        return go.Scatter(\n",
"            x=frame.index,\n",
"            y=frame.prediction,\n",
"            mode=\"lines\",\n",
"            line={\"dash\": \"dot\"},\n",
"            name=label,\n",
"            marker=dict(),\n",
"            text=frame.index,\n",
"            opacity=0.8,\n",
"        )\n",
"\n",
"    actual_trace = go.Scatter(\n",
"        x=df_result.index,\n",
"        y=df_result.value,\n",
"        mode=\"lines\",\n",
"        name=\"values\",\n",
"        marker=dict(),\n",
"        text=df_result.index,\n",
"        line=dict(color=\"rgba(0,0,0, 0.3)\"),\n",
"    )\n",
"\n",
"    # Trace order: actual values, baseline, model predictions.\n",
"    traces = [\n",
"        actual_trace,\n",
"        dotted_trace(df_baseline, 'linear regression'),\n",
"        dotted_trace(df_result, 'predictions'),\n",
"    ]\n",
"\n",
"    figure_layout = dict(\n",
"        title=\"Predictions vs Actual Values for the dataset\",\n",
"        xaxis=dict(title=\"Time\", ticklen=5, zeroline=False),\n",
"        yaxis=dict(title=\"Value\", ticklen=5, zeroline=False),\n",
"    )\n",
"    go.Figure(data=traces, layout=figure_layout).show()\n",
" \n",
" \n",
"# Set notebook mode to work in offline\n",
"pyo.init_notebook_mode()\n",
"\n",
"plot_predictions(df_result, df_baseline)"
]
}
],
"metadata": {
"interpreter": {
"hash": "73e8fb81fc9d21637ba62ed4f9412d39843bbeeb61edb8163afd2f9314d52c65"
},
"kernelspec": {
"display_name": "Python 3.7.6 64-bit (system)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}