{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Bayesian Probabilistic Matrix Factorization\n", "\n", "**Published**: November 6, 2020\n", "\n", "**Author**: Xinyu Chen [[**GitHub homepage**](https://github.com/xinychen)]\n", "\n", "**Download**: This Jupyter notebook is at our GitHub repository. If you want to evaluate the code, please download the notebook from the [**transdim**](https://github.com/xinychen/transdim/blob/master/imputer/BPMF.ipynb) repository.\n", "\n", "This notebook shows how to implement the Bayesian Probabilistic Matrix Factorization (BPMF), a fully Bayesian matrix factorization model, on some real-world data sets. For an in-depth discussion of BPMF, please see [1].\n", "\n", "
\n", "\n", "[1] Ruslan Salakhutdinov, Andriy Mnih (2008). Bayesian probabilistic matrix factorization using Markov chain Monte Carlo. ICML 2008. [PDF] [Matlab code] \n", "\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Import some necessary packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from numpy.linalg import inv as inv\n", "from numpy.random import normal as normrnd\n", "from scipy.linalg import khatri_rao as kr_prod\n", "from scipy.stats import wishart\n", "from numpy.linalg import solve as solve\n", "from scipy.linalg import cholesky as cholesky_upper\n", "from scipy.linalg import solve_triangular as solve_ut\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Define some functions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def mvnrnd_pre(mu, Lambda):\n", " src = normrnd(size = (mu.shape[0],))\n", " return solve_ut(cholesky_upper(Lambda, overwrite_a = True, check_finite = False), \n", " src, lower = False, check_finite = False, overwrite_b = True) + mu\n", "\n", "def cov_mat(mat, mat_bar):\n", " mat = mat - mat_bar\n", " return mat.T @ mat" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sample factor $\\boldsymbol{W}$" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def sample_factor_w(tau_sparse_mat, tau_ind, W, X, tau, beta0 = 1, vargin = 0):\n", " \"\"\"Sampling N-by-R factor matrix W and its hyperparameters (mu_w, Lambda_w).\"\"\"\n", " \n", " dim1, rank = W.shape\n", " W_bar = np.mean(W, axis = 0)\n", " temp = dim1 / (dim1 + beta0)\n", " var_mu_hyper = temp * W_bar\n", " var_W_hyper = inv(np.eye(rank) + cov_mat(W, W_bar) + temp * beta0 * np.outer(W_bar, W_bar))\n", " var_Lambda_hyper = wishart.rvs(df = dim1 + rank, scale = var_W_hyper)\n", " var_mu_hyper = mvnrnd_pre(var_mu_hyper, (dim1 + beta0) * var_Lambda_hyper)\n", " \n", " if dim1 * rank ** 2 > 1e+8:\n", " vargin = 1\n", " \n", " if vargin == 0:\n", " var1 = X.T\n", " var2 = kr_prod(var1, var1)\n", " var3 = (var2 @ tau_ind.T).reshape([rank, rank, dim1]) + var_Lambda_hyper[:, :, np.newaxis]\n", " var4 = var1 @ tau_sparse_mat.T + (var_Lambda_hyper @ var_mu_hyper)[:, np.newaxis]\n", " for i in range(dim1):\n", " W[i, :] = mvnrnd_pre(solve(var3[:, :, i], var4[:, i]), var3[:, :, i])\n", " elif vargin == 1:\n", " for i in range(dim1):\n", " pos0 = np.where(sparse_mat[i, :] != 0)\n", " Xt = X[pos0[0], :]\n", " var_mu = tau * Xt.T @ sparse_mat[i, pos0[0]] + var_Lambda_hyper @ var_mu_hyper\n", " var_Lambda = tau * Xt.T @ Xt + var_Lambda_hyper\n", " W[i, :] = mvnrnd_pre(solve(var_Lambda, var_mu), var_Lambda)\n", " \n", " return W" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sample factor $\\boldsymbol{X}$" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def sample_factor_x(tau_sparse_mat, tau_ind, W, X, beta0 = 1):\n", " \"\"\"Sampling T-by-R factor matrix X and its hyperparameters (mu_x, Lambda_x).\"\"\"\n", " \n", " dim2, rank = X.shape\n", " X_bar = np.mean(X, axis = 0)\n", " temp = dim2 / (dim2 + beta0)\n", " var_mu_hyper = temp * X_bar\n", " var_X_hyper = inv(np.eye(rank) + cov_mat(X, X_bar) + temp * beta0 * np.outer(X_bar, X_bar))\n", " var_Lambda_hyper = wishart.rvs(df = dim2 + rank, scale = var_X_hyper)\n", " var_mu_hyper = mvnrnd_pre(var_mu_hyper, (dim2 + beta0) * var_Lambda_hyper)\n", " \n", " var1 = W.T\n", " var2 = kr_prod(var1, var1)\n", " var3 = (var2 @ tau_ind).reshape([rank, rank, dim2]) + var_Lambda_hyper[:, :, np.newaxis]\n", " 
var4 = var1 @ tau_sparse_mat + (var_Lambda_hyper @ var_mu_hyper)[:, np.newaxis]\n", "    for t in range(dim2):\n", "        X[t, :] = mvnrnd_pre(solve(var3[:, :, t], var4[:, t]), var3[:, :, t])\n", "\n", "    return X" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sample precision $\tau$" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def sample_precision_tau(sparse_mat, mat_hat, ind):\n", "    \"\"\"Sample the noise precision tau from its conjugate Gamma conditional.\"\"\"\n", "    var_alpha = 1e-6 + 0.5 * np.sum(ind)\n", "    var_beta = 1e-6 + 0.5 * np.sum(((sparse_mat - mat_hat) ** 2) * ind)\n", "    return np.random.gamma(var_alpha, 1 / var_beta)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def compute_mape(var, var_hat):\n", "    return np.sum(np.abs(var - var_hat) / var) / var.shape[0]\n", "\n", "def compute_rmse(var, var_hat):\n", "    return np.sqrt(np.sum((var - var_hat) ** 2) / var.shape[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### BPMF implementation\n", "\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter):\n", "    \"\"\"Bayesian Probabilistic Matrix Factorization, BPMF.\"\"\"\n", "    \n", "    dim1, dim2 = sparse_mat.shape\n", "    W = init[\"W\"]\n", "    X = init[\"X\"]\n", "    # Missing entries are marked either by zeros or by NaNs; build the\n", "    # observation mask and the held-out test positions accordingly.\n", "    if not np.isnan(sparse_mat).any():\n", "        ind = sparse_mat != 0\n", "        pos_obs = np.where(ind)\n", "        pos_test = np.where((dense_mat != 0) & (sparse_mat == 0))\n", "    else:\n", "        pos_test = np.where((dense_mat != 0) & (np.isnan(sparse_mat)))\n", "        ind = ~np.isnan(sparse_mat)\n", "        pos_obs = np.where(ind)\n", "        sparse_mat[np.isnan(sparse_mat)] = 0\n", "    dense_test = dense_mat[pos_test]\n", "    del dense_mat\n", "    tau = 1\n", "    W_plus = np.zeros((dim1, rank))\n", "    X_plus = np.zeros((dim2, rank))\n", "    temp_hat = np.zeros(sparse_mat.shape)\n", "    show_iter = 200\n", "    mat_hat_plus = np.zeros(sparse_mat.shape)\n", "    for it in range(burn_iter + gibbs_iter):\n", "        tau_ind = tau * ind\n", "        tau_sparse_mat = tau * sparse_mat\n", "        W = sample_factor_w(tau_sparse_mat, tau_ind, W, X, tau)\n", "        X = sample_factor_x(tau_sparse_mat, tau_ind, W, X)\n", "        mat_hat = W @ X.T\n", "        tau = sample_precision_tau(sparse_mat, mat_hat, ind)\n", "        temp_hat += mat_hat\n", "        if (it + 1) % show_iter == 0 and it < burn_iter:\n", "            temp_hat = temp_hat / show_iter\n", "            print('Iter: {}'.format(it + 1))\n", "            print('MAPE: {:.6}'.format(compute_mape(dense_test, temp_hat[pos_test])))\n", "            print('RMSE: {:.6}'.format(compute_rmse(dense_test, temp_hat[pos_test])))\n", "            temp_hat = np.zeros(sparse_mat.shape)\n", "            print()\n", "        # After burn-in, accumulate the draws; their average is the final estimate.\n", "        if it + 1 > burn_iter:\n", "            W_plus += W\n", "            X_plus += X\n", "            mat_hat_plus += mat_hat\n", "    mat_hat = mat_hat_plus / gibbs_iter\n", "    W = W_plus / gibbs_iter\n", "    X = X_plus / gibbs_iter\n", "    print('Imputation MAPE: {:.6}'.format(compute_mape(dense_test, mat_hat[pos_test])))\n", "    print('Imputation RMSE: {:.6}'.format(compute_rmse(dense_test, mat_hat[pos_test])))\n", "    print()\n", "    \n", "    return mat_hat, W, X" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation on Guangzhou Speed Data\n", "\n", "**Scenario setting**:\n", "\n", "- Tensor size: $214\times 61\times 144$ (road segment, day, time of day)\n", "- Non-random missing (NM)\n", "- 40% missing rate (the masking trick behind both missing patterns is sketched in the next cell)\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 10\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", 
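"execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A quick illustrative check (an aside, not needed for the experiments): the masking\n", "# trick used in every scenario below is np.round(u + 0.5 - missing_rate) with\n", "# u ~ Uniform(0, 1), which equals 1 with probability 1 - missing_rate, i.e. a\n", "# Bernoulli keep/drop draw. Random missing (RM) draws one u per entry, while\n", "# non-random missing (NM) draws one u per (row, day) pair and broadcasts it along\n", "# the time-of-day axis, so whole daily series are removed at once.\n", "import numpy as np\n", "np.random.seed(0)\n", "u = np.random.rand(1000000)\n", "print(np.round(u + 0.5 - 0.4).mean())  # ~ 0.6, the fraction kept when missing_rate = 0.4" ] }, { "cell_type": "code", 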
"execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Non-random missing (NM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1])[:, :, np.newaxis] + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 10\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $214\\times 61\\times 144$ (road segment, day, time of day)\n", "- Random missing (RM)\n", "- 40% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 80\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 80\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $214\\times 61\\times 144$ (road segment, day, time of day)\n", "- Random missing (RM)\n", "- 60% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 80\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Guangzhou-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.6 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 80\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", 
"burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation on Birmingham Parking Data\n", "\n", "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 77\\times 18$ (parking slot, day, time of day)\n", "- Non-random missing (NM)\n", "- 40% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 20\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Birmingham-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Non-random missing (NM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1])[:, :, np.newaxis] + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 20\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 77\\times 18$ (parking slot, day, time of day)\n", "- Random missing (RM)\n", "- 40% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 20\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Birmingham-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 20\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 77\\times 18$ (parking slot, day, time of day)\n", "- Random missing (RM)\n", "- 60% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 20\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", 
"np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Birmingham-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.6 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 20\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation on Hangzhou Flow Data\n", "\n", "**Scenario setting**:\n", "\n", "- Tensor size: $80\\times 25\\times 108$ (metro station, day, time of day)\n", "- Non-random missing (NM)\n", "- 40% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.349724\n", "RMSE: 50.3797\n", "\n", "Iter: 400\n", "MAPE: 0.353828\n", "RMSE: 52.0155\n", "\n", "Iter: 600\n", "MAPE: 0.352678\n", "RMSE: 49.2252\n", "\n", "Iter: 800\n", "MAPE: 0.353151\n", "RMSE: 45.6113\n", "\n", "Iter: 1000\n", "MAPE: 0.352523\n", "RMSE: 45.7271\n", "\n", "Imputation MAPE: 0.350175\n", "Imputation RMSE: 45.3692\n", "\n", "Running time: 243 seconds\n" ] } ], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Non-random missing (NM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1])[:, :, np.newaxis] + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $80\\times 25\\times 108$ (metro station, day, time of day)\n", "- Random missing (RM)\n", "- 40% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.322692\n", "RMSE: 41.591\n", "\n", "Iter: 400\n", "MAPE: 0.323997\n", "RMSE: 41.9302\n", "\n", "Iter: 600\n", "MAPE: 0.324506\n", "RMSE: 41.8721\n", "\n", "Iter: 800\n", "MAPE: 0.322947\n", "RMSE: 41.7555\n", "\n", "Iter: 1000\n", "MAPE: 0.324867\n", 
"RMSE: 41.9841\n", "\n", "Imputation MAPE: 0.323612\n", "Imputation RMSE: 41.8382\n", "\n", "Running time: 234 seconds\n" ] } ], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $80\\times 25\\times 108$ (metro station, day, time of day)\n", "- Random missing (RM)\n", "- 60% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.393414\n", "RMSE: 46.1231\n", "\n", "Iter: 400\n", "MAPE: 0.397607\n", "RMSE: 46.8354\n", "\n", "Iter: 600\n", "MAPE: 0.400496\n", "RMSE: 46.8306\n", "\n", "Iter: 800\n", "MAPE: 0.39948\n", "RMSE: 46.6026\n", "\n", "Iter: 1000\n", "MAPE: 0.400208\n", "RMSE: 46.7884\n", "\n", "Imputation MAPE: 0.398716\n", "Imputation RMSE: 46.6787\n", "\n", "Running time: 234 seconds\n" ] } ], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Hangzhou-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.6 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation on Seattle Speed Data\n", "\n", "**Scenario setting**:\n", "\n", "- Tensor size: $323\\times 28\\times 288$ (road segment, day, time of day)\n", "- Non-random missing (NM)\n", "- 40% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 10\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.0962377\n", "RMSE: 5.41434\n", "\n", "Iter: 400\n", 
"MAPE: 0.0918425\n", "RMSE: 5.30282\n", "\n", "Iter: 600\n", "MAPE: 0.0918725\n", "RMSE: 5.30348\n", "\n", "Iter: 800\n", "MAPE: 0.0918654\n", "RMSE: 5.30324\n", "\n", "Iter: 1000\n", "MAPE: 0.0918592\n", "RMSE: 5.30311\n", "\n", "Imputation MAPE: 0.0918528\n", "Imputation RMSE: 5.30302\n", "\n", "Running time: 441 seconds\n" ] } ], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Seattle-data-set/tensor.npz')['arr_0']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Non-random missing (NM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1])[:, :, np.newaxis] + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 10\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $323\\times 28\\times 288$ (road segment, day, time of day)\n", "- Random missing (RM)\n", "- 40% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 50\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.0677314\n", "RMSE: 4.16611\n", "\n", "Iter: 400\n", "MAPE: 0.0678637\n", "RMSE: 4.18104\n", "\n", "Iter: 600\n", "MAPE: 0.067846\n", "RMSE: 4.18084\n", "\n", "Iter: 800\n", "MAPE: 0.0678551\n", "RMSE: 4.1807\n", "\n", "Iter: 1000\n", "MAPE: 0.0678554\n", "RMSE: 4.18122\n", "\n", "Imputation MAPE: 0.0678574\n", "Imputation RMSE: 4.1814\n", "\n", "Running time: 2302 seconds\n" ] } ], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Seattle-data-set/tensor.npz')['arr_0']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 50\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $323\\times 28\\times 288$ (road segment, day, time of day)\n", "- Random missing (RM)\n", "- 60% missing rate\n", "\n", "**Model setting**:\n", "\n", "- Low rank: 50\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": 
[ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.0735971\n", "RMSE: 4.47002\n", "\n", "Iter: 400\n", "MAPE: 0.0737839\n", "RMSE: 4.49133\n", "\n", "Iter: 600\n", "MAPE: 0.0738166\n", "RMSE: 4.49302\n", "\n", "Iter: 800\n", "MAPE: 0.0738074\n", "RMSE: 4.49357\n", "\n", "Iter: 1000\n", "MAPE: 0.0737871\n", "RMSE: 4.49266\n", "\n", "Imputation MAPE: 0.0738219\n", "Imputation RMSE: 4.4948\n", "\n", "Running time: 2280 seconds\n" ] } ], "source": [ "import time\n", "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/Seattle-data-set/tensor.npz')['arr_0']\n", "dim = dense_tensor.shape\n", "missing_rate = 0.6 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "dense_mat = dense_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "sparse_mat = sparse_tensor.reshape([dim[0], dim[1] * dim[2]])\n", "del dense_tensor, sparse_tensor\n", "\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 50\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation on London Movement Speed Data\n", "\n", "London movement speed data set is is a city-wide hourly traffic speeddataset collected in London.\n", "\n", "- Collected from 200,000+ road segments.\n", "- 720 time points in April 2019.\n", "- 73% missing values in the original data.\n", "\n", "| Observation rate | $>90\\%$ | $>80\\%$ | $>70\\%$ | $>60\\%$ | $>50\\%$ |\n", "|:------------------|--------:|--------:|--------:|--------:|--------:|\n", "|**Number of roads**| 17,666 | 27,148 | 35,912 | 44,352 | 52,727 |\n", "\n", "\n", "If want to test on the full dataset, you could consider the following setting for masking observations as missing values. \n", "\n", "```python\n", "import numpy as np\n", "np.random.seed(1000)\n", "mask_rate = 0.20\n", "\n", "dense_mat = np.load('../datasets/London-data-set/hourly_speed_mat.npy')\n", "pos_obs = np.where(dense_mat != 0)\n", "num = len(pos_obs[0])\n", "sample_ind = np.random.choice(num, size = int(mask_rate * num), replace = False)\n", "sparse_mat = dense_mat.copy()\n", "sparse_mat[pos_obs[0][sample_ind], pos_obs[1][sample_ind]] = 0\n", "```\n", "\n", "Notably, you could also consider to evaluate the model on a subset of the data with the following setting." 
] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "np.random.seed(1000)\n", "\n", "missing_rate = 0.4\n", "\n", "dense_mat = np.load('../datasets/London-data-set/hourly_speed_mat.npy')\n", "binary_mat = dense_mat.copy()\n", "binary_mat[binary_mat != 0] = 1\n", "pos = np.where(np.sum(binary_mat, axis = 1) > 0.7 * binary_mat.shape[1])\n", "dense_mat = dense_mat[pos[0], :]\n", "\n", "## Non-random missing (NM)\n", "binary_mat = np.zeros(dense_mat.shape)\n", "random_mat = np.random.rand(dense_mat.shape[0], 30)\n", "for i1 in range(dense_mat.shape[0]):\n", " for i2 in range(30):\n", " binary_mat[i1, i2 * 24 : (i2 + 1) * 24] = np.round(random_mat[i1, i2] + 0.5 - missing_rate)\n", "sparse_mat = np.multiply(dense_mat, binary_mat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 20\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.0983183\n", "RMSE: 2.40201\n", "\n", "Iter: 400\n", "MAPE: 0.094693\n", "RMSE: 2.29785\n", "\n", "Iter: 600\n", "MAPE: 0.0946962\n", "RMSE: 2.29789\n", "\n", "Iter: 800\n", "MAPE: 0.0946931\n", "RMSE: 2.2979\n", "\n", "Iter: 1000\n", "MAPE: 0.0946954\n", "RMSE: 2.29787\n", "\n", "Imputation MAPE: 0.0946913\n", "Imputation RMSE: 2.29782\n", "\n", "Running time: 3566 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 20\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "np.random.seed(1000)\n", "\n", "missing_rate = 0.4\n", "\n", "dense_mat = np.load('../datasets/London-data-set/hourly_speed_mat.npy')\n", "binary_mat = dense_mat.copy()\n", "binary_mat[binary_mat != 0] = 1\n", "pos = np.where(np.sum(binary_mat, axis = 1) > 0.7 * binary_mat.shape[1])\n", "dense_mat = dense_mat[pos[0], :]\n", "\n", "## Random missing (RM)\n", "random_mat = np.random.rand(dense_mat.shape[0], dense_mat.shape[1])\n", "binary_mat = np.round(random_mat + 0.5 - missing_rate)\n", "sparse_mat = np.multiply(dense_mat, binary_mat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 20\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.0987723\n", "RMSE: 2.41736\n", "\n", "Iter: 400\n", "MAPE: 0.0914476\n", "RMSE: 2.21269\n", "\n", "Iter: 600\n", "MAPE: 0.0914455\n", "RMSE: 2.21263\n", "\n", "Iter: 800\n", "MAPE: 0.0914505\n", "RMSE: 2.21272\n", "\n", "Iter: 1000\n", "MAPE: 0.091447\n", "RMSE: 2.21266\n", "\n", "Imputation MAPE: 0.091447\n", "Imputation RMSE: 2.21267\n", "\n", "Running time: 3378 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 20\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", 
"burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "np.random.seed(1000)\n", "\n", "missing_rate = 0.6\n", "\n", "dense_mat = np.load('../datasets/London-data-set/hourly_speed_mat.npy')\n", "binary_mat = dense_mat.copy()\n", "binary_mat[binary_mat != 0] = 1\n", "pos = np.where(np.sum(binary_mat, axis = 1) > 0.7 * binary_mat.shape[1])\n", "dense_mat = dense_mat[pos[0], :]\n", "\n", "## Random missing (RM)\n", "random_mat = np.random.rand(dense_mat.shape[0], dense_mat.shape[1])\n", "binary_mat = np.round(random_mat + 0.5 - missing_rate)\n", "sparse_mat = np.multiply(dense_mat, binary_mat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 20\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.11639\n", "RMSE: 2.87399\n", "\n", "Iter: 400\n", "MAPE: 0.0928231\n", "RMSE: 2.24379\n", "\n", "Iter: 600\n", "MAPE: 0.0928241\n", "RMSE: 2.24376\n", "\n", "Iter: 800\n", "MAPE: 0.0928241\n", "RMSE: 2.24374\n", "\n", "Iter: 1000\n", "MAPE: 0.0928254\n", "RMSE: 2.2438\n", "\n", "Imputation MAPE: 0.0928232\n", "Imputation RMSE: 2.24375\n", "\n", "Running time: 3489 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 20\n", "init = {\"W\": 0.01 * np.random.randn(dim1, rank), \"X\": 0.01 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Eavluation on NYC Taxi Flow Data\n", "\n", "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 30\\times 1461$ (origin, destination, time)\n", "- Random missing (RM)\n", "- 40% missing rate\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/NYC-data-set/tensor.mat')['tensor'].astype(np.float32)\n", "dim = dense_tensor.shape\n", "missing_rate = 0.4 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "\n", "dim1, dim2, dim3 = dense_tensor.shape\n", "dense_mat = np.zeros((dim1 * dim2, dim3))\n", "sparse_mat = np.zeros((dim1 * dim2, dim3))\n", "for i in range(dim1):\n", " dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]\n", " sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]\n", "del dense_tensor, sparse_tensor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.446785\n", "RMSE: 4.63568\n", "\n", "Iter: 400\n", "MAPE: 0.449345\n", "RMSE: 4.70971\n", "\n", "Iter: 600\n", "MAPE: 
0.449055\n", "RMSE: 4.70074\n", "\n", "Iter: 800\n", "MAPE: 0.449258\n", "RMSE: 4.71094\n", "\n", "Iter: 1000\n", "MAPE: 0.449475\n", "RMSE: 4.71905\n", "\n", "Imputation MAPE: 0.449309\n", "Imputation RMSE: 4.71647\n", "\n", "Running time: 315 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 30\\times 1461$ (origin, destination, time)\n", "- Random missing (RM)\n", "- 60% missing rate\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/NYC-data-set/tensor.mat')['tensor'].astype(np.float32)\n", "dim = dense_tensor.shape\n", "missing_rate = 0.6 # Random missing (RM)\n", "sparse_tensor = dense_tensor * np.round(np.random.rand(dim[0], dim[1], dim[2]) + 0.5 - missing_rate)\n", "\n", "dim1, dim2, dim3 = dense_tensor.shape\n", "dense_mat = np.zeros((dim1 * dim2, dim3))\n", "sparse_mat = np.zeros((dim1 * dim2, dim3))\n", "for i in range(dim1):\n", " dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]\n", " sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]\n", "del dense_tensor, sparse_tensor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.461471\n", "RMSE: 4.846\n", "\n", "Iter: 400\n", "MAPE: 0.466909\n", "RMSE: 4.96785\n", "\n", "Iter: 600\n", "MAPE: 0.467616\n", "RMSE: 4.98044\n", "\n", "Iter: 800\n", "MAPE: 0.467748\n", "RMSE: 4.98002\n", "\n", "Iter: 1000\n", "MAPE: 0.467811\n", "RMSE: 4.98422\n", "\n", "Imputation MAPE: 0.467925\n", "Imputation RMSE: 4.989\n", "\n", "Running time: 290 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 30\\times 1461$ (origin, destination, time)\n", "- Non-random missing (NM)\n", "- 40% missing rate\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import scipy.io\n", "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = scipy.io.loadmat('../datasets/NYC-data-set/tensor.mat')['tensor']\n", "dim = dense_tensor.shape\n", "nm_tensor = np.random.rand(dim[0], dim[1], dim[2])\n", "missing_rate = 0.4 # Non-random missing (NM)\n", "binary_tensor = np.zeros(dense_tensor.shape)\n", "for i1 in range(dim[0]):\n", " for i2 in range(dim[1]):\n", " for 
i3 in range(61):\n", " binary_tensor[i1, i2, i3 * 24 : (i3 + 1) * 24] = np.round(nm_tensor[i1, i2, i3] + 0.5 - missing_rate)\n", "sparse_tensor = dense_tensor * binary_tensor\n", "\n", "dim1, dim2, dim3 = dense_tensor.shape\n", "dense_mat = np.zeros((dim1 * dim2, dim3))\n", "sparse_mat = np.zeros((dim1 * dim2, dim3))\n", "for i in range(dim1):\n", " dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]\n", " sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]\n", "del dense_tensor, sparse_tensor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.450534\n", "RMSE: 4.70565\n", "\n", "Iter: 400\n", "MAPE: 0.453454\n", "RMSE: 4.78572\n", "\n", "Iter: 600\n", "MAPE: 0.453593\n", "RMSE: 4.78719\n", "\n", "Iter: 800\n", "MAPE: 0.453732\n", "RMSE: 4.79153\n", "\n", "Iter: 1000\n", "MAPE: 0.453568\n", "RMSE: 4.78587\n", "\n", "Imputation MAPE: 0.453621\n", "Imputation RMSE: 4.78941\n", "\n", "Running time: 286 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation on Pacific Surface Temperature Data\n", "\n", "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 84\\times 396$ (location x, location y, month)\n", "- Random missing (RM)\n", "- 40% missing rate" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = np.load('../datasets/Temperature-data-set/tensor.npy').astype(np.float32)\n", "pos = np.where(dense_tensor[:, 0, :] > 50)\n", "dense_tensor[pos[0], :, pos[1]] = 0\n", "random_tensor = np.random.rand(dense_tensor.shape[0], dense_tensor.shape[1], dense_tensor.shape[2])\n", "missing_rate = 0.4\n", "\n", "## Random missing (RM)\n", "binary_tensor = np.round(random_tensor + 0.5 - missing_rate)\n", "sparse_tensor = dense_tensor.copy()\n", "sparse_tensor[binary_tensor == 0] = np.nan\n", "sparse_tensor[sparse_tensor == 0] = np.nan\n", "\n", "dim1, dim2, dim3 = dense_tensor.shape\n", "dense_mat = np.zeros((dim1 * dim2, dim3))\n", "sparse_mat = np.zeros((dim1 * dim2, dim3))\n", "for i in range(dim1):\n", " dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]\n", " sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.0138166\n", "RMSE: 0.460386\n", "\n", "Iter: 400\n", "MAPE: 0.0138145\n", "RMSE: 0.459798\n", "\n", "Iter: 600\n", "MAPE: 0.0138147\n", "RMSE: 0.459801\n", "\n", "Iter: 800\n", "MAPE: 0.0138142\n", "RMSE: 0.459815\n", "\n", "Iter: 1000\n", "MAPE: 
0.0138138\n", "RMSE: 0.459805\n", "\n", "Imputation MAPE: 0.0138141\n", "Imputation RMSE: 0.459829\n", "\n", "Running time: 440 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 84\\times 396$ (location x, location y, month)\n", "- Random missing (RM)\n", "- 60% missing rate" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = np.load('../datasets/Temperature-data-set/tensor.npy').astype(np.float32)\n", "pos = np.where(dense_tensor[:, 0, :] > 50)\n", "dense_tensor[pos[0], :, pos[1]] = 0\n", "random_tensor = np.random.rand(dense_tensor.shape[0], dense_tensor.shape[1], dense_tensor.shape[2])\n", "missing_rate = 0.6\n", "\n", "## Random missing (RM)\n", "binary_tensor = np.round(random_tensor + 0.5 - missing_rate)\n", "sparse_tensor = dense_tensor.copy()\n", "sparse_tensor[binary_tensor == 0] = np.nan\n", "sparse_tensor[sparse_tensor == 0] = np.nan\n", "\n", "dim1, dim2, dim3 = dense_tensor.shape\n", "dense_mat = np.zeros((dim1 * dim2, dim3))\n", "sparse_mat = np.zeros((dim1 * dim2, dim3))\n", "for i in range(dim1):\n", " dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]\n", " sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.0145174\n", "RMSE: 0.487207\n", "\n", "Iter: 400\n", "MAPE: 0.0144817\n", "RMSE: 0.485571\n", "\n", "Iter: 600\n", "MAPE: 0.0144834\n", "RMSE: 0.485601\n", "\n", "Iter: 800\n", "MAPE: 0.0144779\n", "RMSE: 0.485488\n", "\n", "Iter: 1000\n", "MAPE: 0.0144777\n", "RMSE: 0.485473\n", "\n", "Imputation MAPE: 0.0144806\n", "Imputation RMSE: 0.485545\n", "\n", "Running time: 378 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Scenario setting**:\n", "\n", "- Tensor size: $30\\times 84\\times 396$ (location x, location y, month)\n", "- Non-random missing (NM)\n", "- 40% missing rate" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "np.random.seed(1000)\n", "\n", "dense_tensor = np.load('../datasets/Temperature-data-set/tensor.npy').astype(np.float32)\n", "pos = np.where(dense_tensor[:, 0, :] > 50)\n", "dense_tensor[pos[0], :, pos[1]] = 0\n", "random_tensor = np.random.rand(dense_tensor.shape[0], 
dense_tensor.shape[1], int(dense_tensor.shape[2] / 3))\n", "missing_rate = 0.4\n", "\n", "## Non-random missing (NM)\n", "binary_tensor = np.zeros(dense_tensor.shape)\n", "for i1 in range(dense_tensor.shape[0]):\n", " for i2 in range(dense_tensor.shape[1]):\n", " for i3 in range(int(dense_tensor.shape[2] / 3)):\n", " binary_tensor[i1, i2, i3 * 3 : (i3 + 1) * 3] = np.round(random_tensor[i1, i2, i3] + 0.5 - missing_rate)\n", "sparse_tensor = dense_tensor.copy()\n", "sparse_tensor[binary_tensor == 0] = np.nan\n", "sparse_tensor[sparse_tensor == 0] = np.nan\n", "\n", "dim1, dim2, dim3 = dense_tensor.shape\n", "dense_mat = np.zeros((dim1 * dim2, dim3))\n", "sparse_mat = np.zeros((dim1 * dim2, dim3))\n", "for i in range(dim1):\n", " dense_mat[i * dim2 : (i + 1) * dim2, :] = dense_tensor[i, :, :]\n", " sparse_mat[i * dim2 : (i + 1) * dim2, :] = sparse_tensor[i, :, :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Model setting**:\n", "\n", "- Low rank: 30\n", "- The number of burn-in iterations: 1000\n", "- The number of Gibbs iterations: 200" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Iter: 200\n", "MAPE: 0.0143242\n", "RMSE: 0.475141\n", "\n", "Iter: 400\n", "MAPE: 0.0144314\n", "RMSE: 0.477967\n", "\n", "Iter: 600\n", "MAPE: 0.0144304\n", "RMSE: 0.477905\n", "\n", "Iter: 800\n", "MAPE: 0.0144277\n", "RMSE: 0.477888\n", "\n", "Iter: 1000\n", "MAPE: 0.0144317\n", "RMSE: 0.47798\n", "\n", "Imputation MAPE: 0.0144295\n", "Imputation RMSE: 0.477908\n", "\n", "Running time: 325 seconds\n" ] } ], "source": [ "import time\n", "start = time.time()\n", "dim1, dim2 = sparse_mat.shape\n", "rank = 30\n", "init = {\"W\": 0.1 * np.random.randn(dim1, rank), \"X\": 0.1 * np.random.randn(dim2, rank)}\n", "burn_iter = 1000\n", "gibbs_iter = 200\n", "mat_hat, W, X = BPMF(dense_mat, sparse_mat, init, rank, burn_iter, gibbs_iter)\n", "end = time.time()\n", "print('Running time: %d seconds'%(end - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### License\n", "\n", "
\n", "This work is released under the MIT license.\n", "
" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 2 }