{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# A hands-on introduction to RBM (from a recommendation system point of view)\n", "\n", "### Credit\n", "- [blog](http://blog.echen.me/2011/07/18/introduction-to-restricted-boltzmann-machines/)\n", "\n", "- [source code](https://github.com/echen/restricted-boltzmann-machines)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## Digest the code here\n", "import numpy as np\n", "from IPython.core.display import display, HTML\n", "\n", "def logistic(x):\n", " return 1. / (1 + np.exp(-x))\n", "\n", "data = np.asarray([\n", " [1, 1, 1, 0], \n", " [1, 0, 1, 0],\n", " [1, 1, 1, 0],\n", " [0, 0, 1, 1],\n", " [0, 0, 1, 1],\n", " [0, 0, 1, 1]])\n", "nsample, nvis = data.shape\n", "nhid = 2\n", "print \"\\n>> nsample = %i, nvis = %i, nhid = %i\" % (nsample, nvis, nhid)\n", "print \"\\n>> Original data design matrix (nsample x nvis)\"\n", "display(data)\n", "print \"\\n>> augument data input vectors (first all-one column)\"\n", "data = np.insert(data, 0, 1, axis = 1)\n", "display(data)\n", "\n", "weights = np.random.randn(nvis, nhid)\n", "print \"\\n>> initial weights, shape = collections of column weight vectors\"\n", "print \"weight vectors shape-match input vectors\"\n", "print \"number of weight vectors = number of hidden nodes\"\n", "display(weights)\n", "weights = np.insert(weights, 0, 0, axis = 0)\n", "weights = np.insert(weights, 0, 0, axis = 1)\n", "print \"\\n>> padding the weight matrix with 0s on 1st row and column\"\n", "print \"adding 1st row because now the weight vector should MATCH AUGUMENTED input vector\"\n", "print \"adding 1st col because we need another vector ESSENTIALLY another hidden node\"\n", "print \"IN SUMMARY: we augument both weight and input vector, and always add another hidden node\"\n", "display(weights)\n", "\n", "print \"\\n\" + \"=\" * 30 + \"FORWARD PHASE\" + \"=\" * 30\n", "print \"where data flows from visible layer to hidden layer\"\n", "\n", "print \"\\n>> Given the shapes of weight and design matrices\"\n", "print \"the activation of hidden nodes are actually done as dot of design x weights\"\n", "print \"and the shape of the activations will be nsample x (nhid + 1) - one bias hidden node\"\n", "print \"and the activation of the bias hidden unit is always off initially \"\n", "activations = np.dot(data, weights)\n", "display(activations)\n", "\n", "print \"\\n>> The probability of hidden nodes will be the elementwise logistic transformation of activations\"\n", "print \"NOTE it is NOT a softmax setting so probs of all hidden nodes for certain input dont sum to 1\"\n", "print \"In other words, different nodes are not EXCLUSIVE of each other\"\n", "probs = logistic(activations)\n", "display(probs)\n", "\n", "print \"\\n>> The hidden states (still nsample x nhidden+1) will be binary\"\n", "print \"and it is calculated based on hidden probs compared to a uniform random threshold (0 ~ 1)\"\n", "print \"by this, we add the flavor of uncertainty,\"\n", "print \"and higher prob values should be more likely to be on than lower values\"\n", "states = probs > np.random.rand(nsample, nhid + 1, )\n", "display(states)\n", "\n", "print \"\\n>> Association (ninput+1 x nhidden+1) - OUTER PRODUCT of INPUTS and ACTIVATIONS\"\n", "print \"Here is calcluated based on input and probs (instead of states)\"\n", "print \"So that the association is a smooth one instead of binary, since it is an outer product\"\n", "print 
\"it serves as a similiarity matrix (similiar to covariance matrix)\"\n", "print \"\"\"Intuitively it can be understood the LINK between an INPUT node and an HIDDEN node \n", "(on or off at the same time) by aggregatin all samples\"\"\"\n", "associations = np.dot(data.T, probs)\n", "display(associations)\n", "\n", "print \"\\n\" + \"=\" * 30 + \"BACKWARD PHASE\" + \"=\" * 30\n", "print \"where data flows from hidden layer back to visible layer\"\n", "\n", "print \"\\n>> Visible activation (nsample x nvis+1)\"\n", "print \"assume the Hidden States (not probs) as new design matrix (BINARY AGAIN)\"\n", "print \"and transpose of forward weights as new weights\"\n", "visible_activations = np.dot(states, weights.T)\n", "display(visible_activations)\n", "print \"\\n>> Calculate the firing chance of visible nodes (nsample x nvis+1) in the same way (logistic transf.)\"\n", "visible_probs = logistic(visible_activations)\n", "display(visible_probs)\n", "print \"But this time fix the bias unit by setting all its (1st col) firing chance as 1\"\n", "print \"So that the REFLECTED visible node values are directly comparable with AUGUMENTED inputs\"\n", "visible_probs[:, 0] = 1\n", "display(visible_probs)\n", "print \"\\n>> But it wont stop here when learning the weights\"\n", "print \"We need use the FAKED(REFLECTED) visible inputs and put it forward again\"\n", "print \"just to get the association between the FAKED inputs and hidden nodes again\"\n", "print \"And remember the faked inputs this time is the fixed visible_probs\"\n", "faked_activations = np.dot(visible_probs, weights)\n", "display(faked_activations)\n", "print \"\\n>> And then the faked probs of activations\"\n", "faked_probs = logistic(faked_activations)\n", "display(faked_probs)\n", "print \"\\n>> And then the association between faked inputs and faked hidden probs\"\n", "faked_associations = np.dot(visible_probs.T, faked_probs)\n", "display(faked_associations)\n", "\n", "print \"\\n\" + \"=\" * 30 + \"LEARNING PHASE\" + \"=\" * 30\n", "print \"where the weights are updated based on forward association between input and hidden probs\"\n", "print \"and backward association between REFLECTED input and FAKED hidden probs\"\n", "learning_rate = 0.1\n", "weights += learning_rate * ((associations - faked_associations) / nsample)\n", "print \">> The new weights - NOTE the bias unit got updated as well\"\n", "display(weights)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> nsample = 6, nvis = 4, nhid = 2\n", "\n", ">> Original data design matrix (nsample x nvis)\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[1, 1, 1, 0],\n", " [1, 0, 1, 0],\n", " [1, 1, 1, 0],\n", " [0, 0, 1, 1],\n", " [0, 0, 1, 1],\n", " [0, 0, 1, 1]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> augument data input vectors (first all-one column)\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[1, 1, 1, 1, 0],\n", " [1, 1, 0, 1, 0],\n", " [1, 1, 1, 1, 0],\n", " [1, 0, 0, 1, 1],\n", " [1, 0, 0, 1, 1],\n", " [1, 0, 0, 1, 1]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> initial weights, shape = collections of column weight vectors\n", "weight vectors shape-match input vectors\n", "number of weight vectors = number of hidden nodes\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 1.06162553, 0.89301881],\n", " [ 0.03644388, 1.16875729],\n", " [ 1.16552839, 
-1.00166981],\n", " [ 0.97883434, -0.47188823]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> padding the weight matrix with 0s on 1st row and column\n", "adding 1st row because now the weight vector should MATCH AUGUMENTED input vector\n", "adding 1st col because we need another vector ESSENTIALLY another hidden node\n", "IN SUMMARY: we augument both weight and input vector, and always add another hidden node\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 0. , 0. , 0. ],\n", " [ 0. , 1.06162553, 0.89301881],\n", " [ 0. , 0.03644388, 1.16875729],\n", " [ 0. , 1.16552839, -1.00166981],\n", " [ 0. , 0.97883434, -0.47188823]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "==============================FORWARD PHASE==============================\n", "where data flows from visible layer to hidden layer\n", "\n", ">> Given the shapes of weight and design matrices\n", "the activation of hidden nodes are actually done as dot of design x weights\n", "and the shape of the activations will be nsample x (nhid + 1) - one bias hidden node\n", "and the activation of the bias hidden unit is always off initially \n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 0. , 2.2635978 , 1.06010629],\n", " [ 0. , 2.22715393, -0.10865099],\n", " [ 0. , 2.2635978 , 1.06010629],\n", " [ 0. , 2.14436273, -1.47355803],\n", " [ 0. , 2.14436273, -1.47355803],\n", " [ 0. , 2.14436273, -1.47355803]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> The probability of hidden nodes will be the elementwise logistic transformation of activations\n", "NOTE it is NOT a softmax setting so probs of all hidden nodes for certain input dont sum to 1\n", "In other words, different nodes are not EXCLUSIVE of each other\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 0.5 , 0.90581702, 0.74271086],\n", " [ 0.5 , 0.90266158, 0.47286394],\n", " [ 0.5 , 0.90581702, 0.74271086],\n", " [ 0.5 , 0.89514082, 0.18640241],\n", " [ 0.5 , 0.89514082, 0.18640241],\n", " [ 0.5 , 0.89514082, 0.18640241]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> The hidden states (still nsample x nhidden+1) will be binary\n", "and it is calculated based on hidden probs compared to a uniform random threshold (0 ~ 1)\n", "by this, we add the flavor of uncertainty,\n", "and higher prob values should be more likely to be on than lower values\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ True, True, True],\n", " [False, True, False],\n", " [ True, True, True],\n", " [ True, False, False],\n", " [ True, True, False],\n", " [False, True, True]], dtype=bool)" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> Association (ninput+1 x nhidden+1) - OUTER PRODUCT of INPUTS and ACTIVATIONS\n", "Here is calcluated based on input and probs (instead of states)\n", "So that the association is a smooth one instead of binary, since it is an outer product\n", "it serves as a similiarity matrix (similiar to covariance matrix)\n", "Intuitively it can be understood the LINK between an INPUT node and an HIDDEN node \n", "(on or off at the same time) by aggregatin all samples\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 3. , 5.39971807, 2.5174929 ],\n", " [ 1.5 , 2.71429561, 1.95828566],\n", " [ 1. , 1.81163403, 1.48542172],\n", " [ 3. 
, 5.39971807, 2.5174929 ],\n", " [ 1.5 , 2.68542246, 0.55920724]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "==============================BACKWARD PHASE==============================\n", "where data flows from hidden layer back to visible layer\n", "\n", ">> Visible activation (nsample x nvis+1)\n", "assume the Hidden States (not probs) as new design matrix (BINARY AGAIN)\n", "and transpose of forward weights as new weights\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 0. , 1.95464435, 1.20520116, 0.16385859, 0.50694611],\n", " [ 0. , 1.06162553, 0.03644388, 1.16552839, 0.97883434],\n", " [ 0. , 1.95464435, 1.20520116, 0.16385859, 0.50694611],\n", " [ 0. , 0. , 0. , 0. , 0. ],\n", " [ 0. , 1.06162553, 0.03644388, 1.16552839, 0.97883434],\n", " [ 0. , 1.95464435, 1.20520116, 0.16385859, 0.50694611]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> Calculate the firing chance of visible nodes (nsample x nvis+1) in the same way (logistic transf.)\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 0.5 , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ],\n", " [ 0.5 , 0.74300106, 0.50910996, 0.7623358 , 0.72687686],\n", " [ 0.5 , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ],\n", " [ 0.5 , 0.5 , 0.5 , 0.5 , 0.5 ],\n", " [ 0.5 , 0.74300106, 0.50910996, 0.7623358 , 0.72687686],\n", " [ 0.5 , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "But this time fix the bias unit by setting all its (1st col) firing chance as 1\n", "So that the REFLECTED visible node values are directly comparable with AUGUMENTED inputs\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 1. , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ],\n", " [ 1. , 0.74300106, 0.50910996, 0.7623358 , 0.72687686],\n", " [ 1. , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ],\n", " [ 1. , 0.5 , 0.5 , 0.5 , 0.5 ],\n", " [ 1. , 0.74300106, 0.50910996, 0.7623358 , 0.72687686],\n", " [ 1. , 0.87595218, 0.76944875, 0.54087323, 0.6240903 ]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> But it wont stop here when learning the weights\n", "We need use the FAKED(REFLECTED) visible inputs and put it forward again\n", "just to get the association between the FAKED inputs and hidden nodes again\n", "And remember the faked inputs this time is the fixed visible_probs\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 0. , 2.19925902, 0.84526335],\n", " [ 0. , 2.4073589 , 0.15192652],\n", " [ 0. , 2.19925902, 0.84526335],\n", " [ 0. , 1.62121607, 0.29410903],\n", " [ 0. , 2.4073589 , 0.15192652],\n", " [ 0. , 2.19925902, 0.84526335]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> And then the faked probs of activations\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 0.5 , 0.90018295, 0.69957258],\n", " [ 0.5 , 0.91738674, 0.53790874],\n", " [ 0.5 , 0.90018295, 0.69957258],\n", " [ 0.5 , 0.83496277, 0.57300179],\n", " [ 0.5 , 0.91738674, 0.53790874],\n", " [ 0.5 , 0.90018295, 0.69957258]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", ">> And then the association between faked inputs and faked hidden probs\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 3. 
, 5.3702851 , 3.74753702],\n", " [ 2.30692933, 4.14627168, 2.92421081],\n", " [ 1.91328308, 3.42951677, 2.44906603],\n", " [ 1.82364565, 3.27684949, 2.24177533],\n", " [ 1.91301231, 3.43652212, 2.37827712]])" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "==============================LEARNING PHASE==============================\n", "where the weights are updated based on forward association between input and hidden probs\n", "and backward association between REFLECTED input and FAKED hidden probs\n", ">> The new weights - NOTE the bias unit got updated as well\n" ] }, { "metadata": {}, "output_type": "display_data", "text": [ "array([[ 0.00000000e+00, 4.90549500e-04, -2.05007353e-02],\n", " [ -1.34488222e-02, 1.03775927e+00, 8.76920061e-01],\n", " [ -1.52213847e-02, 9.47916361e-03, 1.15269655e+00],\n", " [ 1.96059058e-02, 1.20090953e+00, -9.97074513e-01],\n", " [ -6.88353854e-03, 9.66316013e-01, -5.02206058e-01]])" ] } ], "prompt_number": 54 }, { "cell_type": "code", "collapsed": false, "input": [ " \n", "## RBM Class\n", "\n", "import numpy as np\n", "\n", "class RBM(object):\n", " def __init__(self, num_visible, num_hidden, learning_rate = 0.1):\n", " self.num_visible = num_visible\n", " self.num_hidden = num_hidden\n", " self.learning_rate = learning_rate\n", " \n", " ## initialize the weight matrix, of nvis x nhid, using\n", " ## a Gaussian distribution with mean = 0, std = 0.1\n", " self.weights = 0.1 * np.random.randn(self.num_visible, self.num_hidden)\n", " ## add bias units into the weights (first row and first column)\n", " self.weights = np.insert(self.weights, 0, 0.0, axis = 0)\n", " self.weights = np.insert(self.weights, 0, 0.0, axis = 1, )\n", " def train(self, data, max_epochs = 1000):\n", " \"\"\"data : design matrix\"\"\"\n", " num_examples = data.shape[0]\n", " ## augument the design matrix\n", " data = np.insert(data, 0, 1.0, axis = 1)\n", " for epoch in range(max_epochs):\n", " ## Clamp to the data and sample from the hidden units\n", " pos_hidden_activations = np.dot(data, self.weights)\n", " pos_hidden_probs = self._logistic(pos_hidden_activations)\n", " pos_hidden_states = pos_hidden_probs > np.random.rand(num_examples, self.num_hidden+1)\n", " ## Note to use activation probabilites of the hidden states, not the hidden states\n", " ## themselves to compute associations. 
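{ "cell_type": "markdown", "metadata": {}, "source": [ "The walkthrough above is exactly one step of contrastive divergence (CD-1). As a recap, here is the same computation condensed into a single function - a minimal sketch that reuses the `logistic`, `data` and `weights` defined above; the helper name `cd1_step` is ours, not from the original post." ] },
{ "cell_type": "code", "collapsed": false, "input": [
"## A minimal sketch: one CD-1 update, condensing the walkthrough above.\n",
"## (`cd1_step` is our own helper name, not from the original post.)\n",
"def cd1_step(data, weights, learning_rate = 0.1):\n",
"    \"\"\"data: augmented design matrix (nsample x nvis+1),\n",
"    weights: augmented weight matrix (nvis+1 x nhid+1).\"\"\"\n",
"    nsample = data.shape[0]\n",
"    nhid = weights.shape[1] - 1\n",
"    ## forward phase: visible -> hidden\n",
"    pos_probs = logistic(np.dot(data, weights))\n",
"    pos_states = pos_probs > np.random.rand(nsample, nhid + 1)\n",
"    pos_assoc = np.dot(data.T, pos_probs)\n",
"    ## backward phase: hidden -> visible (the REFLECTED inputs)\n",
"    vis_probs = logistic(np.dot(pos_states, weights.T))\n",
"    vis_probs[:, 0] = 1  ## fix the bias unit\n",
"    ## forward again with the reflected inputs (the FAKED hidden probs)\n",
"    neg_probs = logistic(np.dot(vis_probs, weights))\n",
"    neg_assoc = np.dot(vis_probs.T, neg_probs)\n",
"    ## learning phase: move the weights along the difference of associations\n",
"    new_weights = weights + learning_rate * (pos_assoc - neg_assoc) / nsample\n",
"    error = np.sum((data - vis_probs) ** 2)\n",
"    return new_weights, error\n",
"\n",
"## e.g. a few more CD-1 steps on the toy data above:\n",
"for _ in range(5):\n",
"    weights, error = cd1_step(data, weights)" ], "language": "python", "metadata": {}, "outputs": [] },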
{ "cell_type": "code", "collapsed": false, "input": [
"## RBM Class\n",
"\n",
"import numpy as np\n",
"\n",
"class RBM(object):\n",
"    def __init__(self, num_visible, num_hidden, learning_rate = 0.1):\n",
"        self.num_visible = num_visible\n",
"        self.num_hidden = num_hidden\n",
"        self.learning_rate = learning_rate\n",
"        \n",
"        ## initialize the weight matrix, of size num_visible x num_hidden, using\n",
"        ## a Gaussian distribution with mean = 0, std = 0.1\n",
"        self.weights = 0.1 * np.random.randn(self.num_visible, self.num_hidden)\n",
"        ## add bias units into the weights (first row and first column)\n",
"        self.weights = np.insert(self.weights, 0, 0.0, axis = 0)\n",
"        self.weights = np.insert(self.weights, 0, 0.0, axis = 1)\n",
"    def train(self, data, max_epochs = 1000):\n",
"        \"\"\"data : design matrix\"\"\"\n",
"        num_examples = data.shape[0]\n",
"        ## augment the design matrix with a first column of 1s for the bias unit\n",
"        data = np.insert(data, 0, 1.0, axis = 1)\n",
"        for epoch in range(max_epochs):\n",
"            ## Clamp to the data and sample from the hidden units\n",
"            pos_hidden_activations = np.dot(data, self.weights)\n",
"            pos_hidden_probs = self._logistic(pos_hidden_activations)\n",
"            pos_hidden_states = pos_hidden_probs > np.random.rand(num_examples, self.num_hidden + 1)\n",
"            ## Note that we use the activation probabilities of the hidden units, not the hidden\n",
"            ## states themselves, to compute associations. We could also use the states; see section 3\n",
"            ## of \"A Practical Guide to Training Restricted Boltzmann Machines\" for details\n",
"            pos_associations = np.dot(data.T, pos_hidden_probs)\n",
"            \n",
"            ## Reconstruct the visible units and sample again from the hidden units\n",
"            neg_visible_activations = np.dot(pos_hidden_states, self.weights.T)\n",
"            neg_visible_probs = self._logistic(neg_visible_activations)\n",
"            neg_visible_probs[:, 0] = 1 ## fix the bias unit\n",
"            neg_hidden_activations = np.dot(neg_visible_probs, self.weights)\n",
"            neg_hidden_probs = self._logistic(neg_hidden_activations)\n",
"            neg_associations = np.dot(neg_visible_probs.T, neg_hidden_probs)\n",
"            \n",
"            ## update weights based on the positive and negative associations\n",
"            self.weights += self.learning_rate * (pos_associations - neg_associations) / num_examples\n",
"            error = np.sum((data - neg_visible_probs) ** 2)\n",
"            #print 'Epoch %s: error is %s' % (epoch, error)\n",
"    def run_visible(self, data):\n",
"        \"\"\"Run the trained network forward on visible data and return a\n",
"        binary sample of the hidden units (bias column dropped).\"\"\"\n",
"        num_examples = data.shape[0]\n",
"        ## augment the design matrix, then repeat the forward phase from train()\n",
"        data = np.insert(data, 0, 1, axis = 1)\n",
"        hidden_probs = self._logistic(np.dot(data, self.weights))\n",
"        hidden_states = hidden_probs > np.random.rand(num_examples, self.num_hidden + 1)\n",
"        ## drop the bias unit's states, which carry no information\n",
"        return hidden_states[:, 1:]\n",
"    def run_hidden(self, data):\n",
"        \"\"\"Given hidden states, sample the visible units (not implemented here).\"\"\"\n",
"        pass\n",
"    def daydream(self, data):\n",
"        \"\"\"Generate samples by alternating Gibbs sampling (not implemented here).\"\"\"\n",
"        pass\n",
"    def _logistic(self, x):\n",
"        return 1.0 / (1 + np.exp(-x))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 67 },
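{ "cell_type": "markdown", "metadata": {}, "source": [ "`run_hidden` and `daydream` are left as stubs above. For illustration, here is one way `daydream` could work - a minimal sketch of alternating Gibbs sampling from a random visible vector, in the spirit of the linked source code. Note the sketch takes a number of samples rather than the stub's `data` argument, and the name `daydream_sketch` is ours." ] },
{ "cell_type": "code", "collapsed": false, "input": [
"## A minimal sketch of what `daydream` could do: start from a random visible\n",
"## vector and run alternating Gibbs sampling, collecting the visible samples.\n",
"## (`daydream_sketch` is our own name; an illustration, not the source version.)\n",
"def daydream_sketch(rbm, num_samples):\n",
"    ## samples[i] holds the (augmented) visible vector after i Gibbs steps\n",
"    samples = np.ones((num_samples, rbm.num_visible + 1))\n",
"    samples[0, 1:] = np.random.rand(rbm.num_visible) > 0.5  ## random start\n",
"    for i in range(1, num_samples):\n",
"        visible = samples[i - 1, :]\n",
"        ## visible -> hidden: sample binary hidden states\n",
"        hidden_states = rbm._logistic(np.dot(visible, rbm.weights)) > np.random.rand(rbm.num_hidden + 1)\n",
"        hidden_states[0] = 1  ## fix the hidden bias unit\n",
"        ## hidden -> visible: sample binary visible states\n",
"        visible_states = rbm._logistic(np.dot(hidden_states, rbm.weights.T)) > np.random.rand(rbm.num_visible + 1)\n",
"        visible_states[0] = 1  ## fix the visible bias unit\n",
"        samples[i, :] = visible_states\n",
"    return samples[:, 1:]  ## drop the bias column\n",
"\n",
"## e.g. after training: daydream_sketch(r, 10)" ], "language": "python", "metadata": {}, "outputs": [] },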
\n", " | Bias Unit | \n", "Hidden 1 | \n", "Hidden 2 | \n", "Hidden 3 | \n", "
---|---|---|---|---|
Bias Unit | \n", "0.458353 | \n", "-0.798174 | \n", "1.268165 | \n", "-0.394543 | \n", "
Harry Potter | \n", "-0.667807 | \n", "-7.777213 | \n", "2.204765 | \n", "4.605650 | \n", "
Avatar | \n", "-8.755023 | \n", "-4.857809 | \n", "1.970422 | \n", "2.654181 | \n", "
LOTR 3 | \n", "4.113024 | \n", "2.564658 | \n", "4.080586 | \n", "2.267659 | \n", "
Gladiator | \n", "0.800234 | \n", "7.249336 | \n", "-1.222727 | \n", "-5.582715 | \n", "
Titanic | \n", "1.005019 | \n", "3.959994 | \n", "-9.195495 | \n", "-3.453264 | \n", "
Glitter | \n", "-3.630298 | \n", "-3.191309 | \n", "-3.269856 | \n", "-3.059602 | \n", "