{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import xlrd\n", "from plotly.graph_objs import Scatter, layout\n", "import plotly\n", "import plotly.offline as py\n", "import plotly.graph_objs as go\n", "import cufflinks as cf\n", "from urllib.request import urlopen\n", "import json\n", "import re\n", "cf.go_offline()###这两句是离线生成图片的设置\n", "cf.set_config_file(offline=True, world_readable=True)\n", "plotly.offline.init_notebook_mode(connected=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data.csv')\n", "df.index = df['Unnamed: 0']\n", "df = df.drop('Unnamed: 0', axis=1)\n", "df_bitcoin = df['Value']\n", "df_gold = df['USD (PM)']\n", "df_gold.drop(index='2016-09-11', axis=0, inplace=True)\n", "df_bitcoin = pd.DataFrame(df_bitcoin)\n", "df_gold = pd.DataFrame(df_gold)\n", "df_bitcoin.rename(columns = {\"Value\": \"value\"}, inplace=True)\n", "df_gold.rename(columns = {\"USD (PM)\": \"value\"}, inplace=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# using lagged observations as features\n", "# 对于X(t+n),就有X(t+1), X(t+2), ..., X(t+n-1)作为特征\n", "def generate_time_lags(df, n_lags):\n", " df_n = df.copy()\n", " for i in range(1, n_lags + 1):\n", " df_n[f\"lag{i}\"] = df_n.shift(i)['value']\n", " df_n = df_n.iloc[n_lags:]\n", " return df_n\n", "input_dim = 20\n", "df_features = generate_time_lags(df_gold, input_dim)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# 添加周几作为特征\n", "df_features['time'] = pd.to_datetime(df_features.index, errors='coerce')\n", "df_features = (df_features\n", " .assign(day_of_week=df_features['time'].dt.dayofweek))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# one-hot处理\n", "def onehot_encode_pd(df, col_name):\n", " dummies = pd.get_dummies(df[col_name], prefix=col_name)\n", " return pd.concat([df, dummies], axis=1).drop(columns=[col_name])\n", "lst = ['day_of_week']\n", "for name in lst:\n", " df_features[name] = df_features[name].astype('category')\n", " df_features = onehot_encode_pd(df_features, name)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# 增加是否当天是假期作为特征\n", "from datetime import date\n", "import holidays\n", "US_holidays = holidays.US()\n", "def is_holiday(date):\n", " # date = date.replace(hour = 0)\n", " return 1 if (date in US_holidays) else 0\n", "\n", "def add_holiday_col(df, holidays):\n", " return df.assign(is_holiday = df.index.to_series().apply(is_holiday))\n", "\n", "df_features = add_holiday_col(df_features, US_holidays)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "df_features.drop('time', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "df_features.to_csv('df_features_gold.csv')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "def feature_label_split(df, target_col):\n", " y = df[[target_col]]\n", " X = df.drop(columns=[target_col])\n", " return X, y\n", "\n", "def train_val_test_split(df, target_col, test_ratio):\n", " val_ratio = test_ratio / (1 - test_ratio)\n", " X, y = feature_label_split(df, target_col)\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, shuffle=False)\n", " X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio, shuffle=False)\n", " return X_train, X_val, X_test, y_train, y_val, y_test\n", "X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(df_features, 'value', 0.2)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# scaling the values \n", "from sklearn.preprocessing import MinMaxScaler\n", "scaler = MinMaxScaler()\n", "X_train_arr = scaler.fit_transform(X_train)\n", "X_val_arr = scaler.transform(X_val)\n", "X_test_arr = scaler.transform(X_test)\n", "\n", "y_train_arr = scaler.fit_transform(y_train)\n", "y_val_arr = scaler.transform(y_val)\n", "y_test_arr = scaler.transform(y_test)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "import torch\n", "train_features = torch.Tensor(X_train_arr)\n", "train_targets = torch.Tensor(y_train_arr)\n", "val_features = torch.Tensor(X_val_arr)\n", "val_targets = torch.Tensor(y_val_arr)\n", "test_features = torch.Tensor(X_test_arr)\n", "test_targets = torch.Tensor(y_test_arr)\n", "from torch.utils.data import TensorDataset, DataLoader\n", "\n", "batch_size = 64\n", "\n", "train = TensorDataset(train_features, train_targets)\n", "val = TensorDataset(val_features, val_targets)\n", "test = TensorDataset(test_features, test_targets)\n", "\n", "train_loader = DataLoader(train, batch_size=batch_size, shuffle=False, drop_last=True)\n", "val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, drop_last=True)\n", "test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, drop_last=True)\n", "test_loader_one = DataLoader(test, batch_size=1, shuffle=False, drop_last=True)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "from torch import nn\n", "class LSTMModel(nn.Module):\n", " def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):\n", " super(LSTMModel, self).__init__()\n", "\n", " # Defining the number of layers and the nodes in each layer\n", " self.hidden_dim = hidden_dim\n", " self.layer_dim = layer_dim\n", "\n", " # LSTM layers\n", " self.lstm = nn.LSTM(\n", " input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob\n", " )\n", "\n", " # Fully connected layer\n", " self.fc = nn.Linear(hidden_dim, output_dim)\n", "\n", " def forward(self, x):\n", " # Initializing hidden state for first input with zeros\n", " h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)\n", "\n", " # Initializing cell state for first input with zeros\n", " c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(device)\n", "\n", " # We need to detach as we are doing truncated backpropagation through time (BPTT)\n", " # If we don't, we'll backprop all the way to the start even after going through another batch\n", " # Forward propagation by passing in the input, hidden state, and cell state into the model\n", " out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))\n", "\n", " # Reshaping the outputs in the shape of (batch_size, seq_length, hidden_size)\n", " # so that it can fit into the fully connected layer\n", " out = out[:, -1, :]\n", "\n", " # Convert the final state to our desired output shape (batch_size, output_dim)\n", " out = self.fc(out)\n", "\n", " return out" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "class Optimization:\n", " def __init__(self, model, loss_fn, optimizer):\n", " self.model = model\n", " self.loss_fn = loss_fn\n", " self.optimizer = optimizer\n", " self.train_losses = []\n", " self.val_losses = []\n", " \n", " def train_step(self, x, y):\n", " # Sets model to train mode\n", " self.model.train()\n", "\n", " # Makes predictions\n", " yhat = self.model(x).to(device)\n", "\n", " # Computes loss\n", " loss = self.loss_fn(y, yhat)\n", "\n", " # Computes gradients\n", " loss.backward()\n", "\n", " # Updates parameters and zeroes gradients\n", " self.optimizer.step()\n", " self.optimizer.zero_grad()\n", "\n", " # Returns the loss\n", " return loss.item()\n", "\n", " def train(self, train_loader, val_loader, batch_size=64, n_epochs=50, n_features=1):\n", " # model_path = f'models/{self.model}_{datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")}'\n", "\n", " for epoch in range(1, n_epochs + 1):\n", " batch_losses = []\n", " for x_batch, y_batch in train_loader:\n", " x_batch = x_batch.view([batch_size, -1, n_features]).to(device)\n", " y_batch = y_batch.to(device)\n", " loss = self.train_step(x_batch, y_batch)\n", " batch_losses.append(loss)\n", " training_loss = np.mean(batch_losses)\n", " self.train_losses.append(training_loss)\n", "\n", " with torch.no_grad():\n", " batch_val_losses = []\n", " for x_val, y_val in val_loader:\n", " x_val = x_val.view([batch_size, -1, n_features]).to(device)\n", " y_val = y_val.to(device)\n", " self.model.eval()\n", " yhat = self.model(x_val)\n", " val_loss = self.loss_fn(y_val, yhat).item()\n", " batch_val_losses.append(val_loss)\n", " validation_loss = np.mean(batch_val_losses)\n", " self.val_losses.append(validation_loss)\n", "\n", " if (epoch <= 10) | (epoch % 50 == 0):\n", " print(\n", " f\"[{epoch}/{n_epochs}] Training loss: {training_loss:.4f}\\t Validation loss: {validation_loss:.4f}\"\n", " )\n", "\n", " # torch.save(self.model.state_dict(), model_path)\n", " def evaluate(self, test_loader, batch_size=1, n_features=1):\n", " with torch.no_grad():\n", " predictions = []\n", " values = []\n", " for x_test, y_test in test_loader:\n", " x_test = x_test.view([batch_size, -1, n_features]).to(device)\n", " y_test = y_test.to(device)\n", " self.model.eval()\n", " yhat = self.model(x_test)\n", " predictions.append(yhat.cpu().detach().numpy())\n", " values.append(y_test.cpu().detach().numpy())\n", "\n", " return predictions, values\n", "\n", " def plot_losses(self):\n", " plt.plot(self.train_losses, label=\"Training loss\")\n", " plt.plot(self.val_losses, label=\"Validation loss\")\n", " plt.legend()\n", " plt.title(\"Losses\")\n", " plt.show()\n", " plt.close()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def get_model(model, model_params):\n", " models = {\n", " \"lstm\": LSTMModel,\n", " }\n", " return models.get(model.lower())(**model_params)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch.optim as optim\n", "\n", "input_dim = len(X_train.columns)\n", "output_dim = 1\n", "hidden_dim = 64\n", "layer_dim = 3\n", "batch_size = 64\n", "dropout = 0.2\n", "n_epochs = 100\n", "learning_rate = 1e-3\n", "weight_decay = 1e-6\n", "\n", "model_params = {'input_dim': input_dim,\n", " 'hidden_dim' : hidden_dim,\n", " 'layer_dim' : layer_dim,\n", " 'output_dim' : output_dim,\n", " 'dropout_prob' : dropout}\n", "\n", "model = get_model('lstm', model_params).to(device)\n", "\n", "loss_fn = nn.MSELoss(reduction=\"mean\")\n", "optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)\n", "\n", "opt = Optimization(model=model, loss_fn=loss_fn, optimizer=optimizer)\n", "opt.train(train_loader, val_loader, batch_size=batch_size, n_epochs=n_epochs, n_features=input_dim)\n", "opt.plot_losses()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 41m 59s\n", "predictions, values = opt.evaluate(test_loader_one, batch_size=1, n_features=input_dim)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 还原成真实值\n", "def inverse_transform(scaler, df, columns):\n", " for col in columns:\n", " df[col] = scaler.inverse_transform(df[col])\n", " return df\n", "\n", "\n", "def format_predictions(predictions, values, df_test, scaler):\n", " vals = np.concatenate(values, axis=0).ravel()\n", " preds = np.concatenate(predictions, axis=0).ravel()\n", " df_result = pd.DataFrame(data={\"value\": vals, \"prediction\": preds}, index=df_test.head(len(vals)).index)\n", " df_result = df_result.sort_index()\n", " df_result = inverse_transform(scaler, df_result, [[\"value\", \"prediction\"]])\n", " return df_result\n", "\n", "\n", "df_result = format_predictions(predictions, values, X_test, scaler)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 计算误差\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", "\n", "def calculate_metrics(df):\n", " return {'mae' : mean_absolute_error(df.value, df.prediction),\n", " 'rmse' : mean_squared_error(df.value, df.prediction) ** 0.5,\n", " 'r2' : r2_score(df.value, df.prediction)}\n", "\n", "result_metrics = calculate_metrics(df_result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# LinearRegression用作比较\n", "from sklearn.linear_model import LinearRegression\n", "\n", "def build_baseline_model(df, test_ratio, target_col):\n", " X, y = feature_label_split(df, target_col)\n", " X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=test_ratio, shuffle=False\n", " )\n", " model = LinearRegression()\n", " model.fit(X_train, y_train)\n", " prediction = model.predict(X_test)\n", "\n", " result = pd.DataFrame(y_test)\n", " result[\"prediction\"] = prediction\n", " result = result.sort_index()\n", "\n", " return result\n", "\n", "df_baseline = build_baseline_model(df_features, 0.2, 'value')\n", "baseline_metrics = calculate_metrics(df_baseline)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 可视化结果\n", "import plotly.graph_objs as go\n", "from plotly.offline import iplot\n", "import plotly.offline as pyo\n", "\n", "def plot_predictions(df_result, df_baseline):\n", " data = []\n", " \n", " value = go.Scatter(\n", " x=df_result.index,\n", " y=df_result.value,\n", " mode=\"lines\",\n", " name=\"values\",\n", " marker=dict(),\n", " text=df_result.index,\n", " line=dict(color=\"rgba(0,0,0, 0.3)\"),\n", " )\n", " data.append(value)\n", "\n", " baseline = go.Scatter(\n", " x=df_baseline.index,\n", " y=df_baseline.prediction,\n", " mode=\"lines\",\n", " line={\"dash\": \"dot\"},\n", " name='linear regression',\n", " marker=dict(),\n", " text=df_baseline.index,\n", " opacity=0.8,\n", " )\n", " data.append(baseline)\n", " \n", " prediction = go.Scatter(\n", " x=df_result.index,\n", " y=df_result.prediction,\n", " mode=\"lines\",\n", " line={\"dash\": \"dot\"},\n", " name='predictions',\n", " marker=dict(),\n", " text=df_result.index,\n", " opacity=0.8,\n", " )\n", " data.append(prediction)\n", " \n", " layout = dict(\n", " title=\"Predictions vs Actual Values for the dataset\",\n", " xaxis=dict(title=\"Time\", ticklen=5, zeroline=False),\n", " yaxis=dict(title=\"Value\", ticklen=5, zeroline=False),\n", " )\n", " fig = go.Figure(data=data, layout=layout)\n", " fig.show()\n", " \n", " \n", "# Set notebook mode to work in offline\n", "pyo.init_notebook_mode()\n", "\n", "plot_predictions(df_result, df_baseline)" ] } ], "metadata": { "interpreter": { "hash": "73e8fb81fc9d21637ba62ed4f9412d39843bbeeb61edb8163afd2f9314d52c65" }, "kernelspec": { "display_name": "Python 3.7.6 64-bit (system)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }