{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "importing Jupyter notebook from q1_softmax.ipynb\n", "importing Jupyter notebook from q2_gradcheck.ipynb\n", "importing Jupyter notebook from q2_sigmoid.ipynb\n" ] } ], "source": [ "import numpy as np\n", "import random\n", "import import_ipynb\n", "\n", "from q1_softmax import softmax\n", "from q2_gradcheck import gradcheck_naive\n", "from q2_sigmoid import sigmoid, sigmoid_grad" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# normalizeRows\n", "![](https://raw.githubusercontent.com/mmmwhy/picture/master/picgo/20190502103029.png)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def normalizeRows(x):\n", " \"\"\" Row normalization function\n", "\n", " Implement a function that normalizes each row of a matrix to have\n", " unit length.\n", " \"\"\"\n", "\n", " ### YOUR CODE HERE\n", " denom = np.linalg.norm(x,axis=1,keepdims=True)\n", " x = x/denom\n", " ### END YOUR CODE\n", "\n", " return x\n", "\n", "def test_normalize_rows():\n", " print(\"Testing normalizeRows...\")\n", " x = normalizeRows(np.array([[3.0,4.0],[1, 2]]))\n", " print(x)\n", " ans = np.array([[0.6,0.8],[0.4472136,0.89442719]])\n", " assert np.allclose(x, ans, rtol=1e-05, atol=1e-06)\n", " print(\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# softmaxCostAndGradient\n", "- $\\begin{align}\t\\hat{\\boldsymbol{y}}_{o} = p(\\boldsymbol{o} \\vert \\boldsymbol{c}) =\\frac{exp(\\boldsymbol{u}_{0}^{T} \\boldsymbol{v}_{c})}{\\sum\\limits_{w=1}^{W} exp(\\boldsymbol{u}_{w}^{T} \\boldsymbol{v}_{c})}\n", "\\end{align}$ 计算得到 `Pred`\n", "\n", "- $\\frac{\\partial J}{\\partial{v_c}} =\\frac{\\partial J}{\\partial \\boldsymbol{z}} \\frac{\\partial z}{\\partial v_c} = U(\\hat{\\boldsymbol{y}} -\\boldsymbol{y})$ \n", "\n", "- $\\frac{\\partial J}{\\partial{U}} =\\frac{\\partial J}{\\partial \\boldsymbol{z}} \\frac{\\partial z}{\\partial U} = v_c(\\hat{\\boldsymbol{y}} -\\boldsymbol{y})^{T}$" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def softmaxCostAndGradient(predicted, target, outputVectors, dataset):\n", " \"\"\" Softmax cost function for word2vec models\n", "\n", " Implement the cost and gradients for one predicted word vector\n", " and one target word vector as a building block for word2vec\n", " models, assuming the softmax prediction function and cross\n", " entropy loss.\n", "\n", " Arguments:\n", " predicted -- numpy ndarray, predicted word vector (\\hat{v} in\n", " the written component)\n", " target -- integer, the index of the target word\n", " outputVectors -- \"output\" vectors (as rows) for all tokens\n", " dataset -- needed for negative sampling, unused here.\n", "\n", " Return:\n", " cost -- cross entropy cost for the softmax word prediction\n", " gradPred -- the gradient with respect to the predicted word\n", " vector\n", " grad -- the gradient with respect to all the other word\n", " vectors\n", "\n", " We will not provide starter code for this function, but feel\n", " free to reference the code you previously wrote for this\n", " assignment!\n", " \"\"\"\n", "\n", " ### YOUR CODE HERE\n", " # target是指公式中下标为o的那个,在skipgram\n", " v_hat = predicted\n", " \n", " #注意到每行代表一个词向量\n", " Pred = softmax(np.dot(outputVectors, v_hat)) \n", " cost = -np.log(Pred[target])\n", " \n", " # \\hat{y} - y 的实现\n", " Pred[target] -= 1.\n", " # 关于V的梯度\n", " gradPred = np.dot(outputVectors.T, Pred)\n", " # 关于U的梯度,pred和v_hat都是向量,扩充为矩阵。\n", " grad = np.outer(Pred, v_hat)\n", " \n", " ### END YOUR CODE\n", " \n", " return cost, gradPred, grad" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def getNegativeSamples(target, dataset, K):\n", " \"\"\" Samples K indexes which are not the target \"\"\"\n", "\n", " indices = [None] * K\n", " for k in range(K):\n", " newidx = dataset.sampleTokenIdx()\n", " while newidx == target:\n", " newidx = dataset.sampleTokenIdx()\n", " indices[k] = newidx\n", " return indices" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# negSamplingCostAndGradient\n", "$\\begin{align} \\frac{\\partial J}{\\partial v_c}&=\\left(\\sigma(u_o^Tv_c)-1\\right)u_o-\\sum_{k=1}^K\\left(\\sigma(-u_k^Tv_c)-1\\right)u_k\\\\ \\frac{\\partial J}{\\partial u_o}&=\\left(\\sigma(u_o^Tv_c)-1\\right)v_c\\\\ \\frac{\\partial J}{\\partial u_k}&=-\\left(\\sigma(-u_k^Tv_c)-1\\right)v_c\\\\ \\end{align}$" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,\n", " K=10):\n", " \"\"\" Negative sampling cost function for word2vec models\n", "\n", " Implement the cost and gradients for one predicted word vector\n", " and one target word vector as a building block for word2vec\n", " models, using the negative sampling technique. K is the sample\n", " size.\n", "\n", " Note: See test_word2vec below for dataset's initialization.\n", "\n", " Arguments/Return Specifications: same as softmaxCostAndGradient\n", " \"\"\"\n", "\n", " # Sampling of indices is done for you. Do not modify this if you\n", " # wish to match the autograder and receive points!\n", " indices = [target]\n", " indices.extend(getNegativeSamples(target, dataset, K))\n", "\n", " ### YOUR CODE HERE\n", " grad = np.zeros(outputVectors.shape)\n", " gradPred =np.zeros(predicted.shape)\n", " cost = 0\n", " \n", " z = sigmoid(np.dot(outputVectors[target], predicted))\n", " cost -= np.log(z)\n", " grad[target] += predicted * (z - 1.0)\n", " gradPred += outputVectors[target] * (z-1.0)\n", " \n", " for k in range(K):\n", " sample = indices[k + 1]\n", " z = sigmoid(np.dot(outputVectors[sample], predicted))\n", " # sigmoid(x) = 1 - sigmoid(-x)\n", " cost -= np.log(1.0 - z)\n", " # sigmoid(-x) -1 = -sigmoid(x)\n", " grad[sample] += predicted * z\n", " gradPred += outputVectors[sample] * z\n", " \n", " \n", " ### END YOUR CODE\n", "\n", " return cost, gradPred, grad" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Skip-gram\n", "- 给定中间词,寻找周围的词。\n", "- 获得cost和梯度,更新词向量。即,把每一个词的梯度,都加起来。\n", "\n", "$\\begin{align}\t\\frac{J_{skip-gram}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{U}} &= \t\t\\sum\\limits_{-m \\leq j \\leq m, j \\ne 0} \\frac{\\partial F(\\boldsymbol{w}_{c+j}, \\boldsymbol{v}_{c})}{\\partial \\boldsymbol{U}} \\nonumber \\\\\t\\frac{J_{skip-gram}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{v}_{c}} &= \t\\sum\\limits_{-m \\leq j \\leq m, j \\ne 0} \\frac{\\partial F(\\boldsymbol{w}_{c+j}, \\boldsymbol{v}_{c})}{\\partial \\boldsymbol{v}_{c}} \\nonumber \\\\\t\\frac{J_{skip-gram}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{v}_{j}} &= 0, \\forall j\\ne c \\nonumber\\end{align}$" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,\n", " dataset, word2vecCostAndGradient=softmaxCostAndGradient):\n", " \"\"\" Skip-gram model in word2vec\n", "\n", " Implement the skip-gram model in this function.\n", "\n", " Arguments:\n", " currrentWord -- a string of the current center word\n", " C -- integer, context size\n", " contextWords -- list of no more than 2*C strings, the context words\n", " tokens -- a dictionary that maps words to their indices in\n", " the word vector list\n", " inputVectors -- \"input\" word vectors (as rows) for all tokens\n", " outputVectors -- \"output\" word vectors (as rows) for all tokens\n", " word2vecCostAndGradient -- the cost and gradient function for\n", " a prediction vector given the target\n", " word vectors, could be one of the two\n", " cost functions you implemented above.\n", "\n", " Return:\n", " cost -- the cost function value for the skip-gram model\n", " grad -- the gradient with respect to the word vectors\n", " \"\"\"\n", "\n", " cost = 0.0\n", " gradIn = np.zeros(inputVectors.shape)\n", " gradOut = np.zeros(outputVectors.shape)\n", "\n", " ### YOUR CODE HERE\n", " cword_index = tokens[currentWord]\n", " vhat = inputVectors[cword_index]\n", " \n", " for j in contextWords:\n", " u_index = tokens[j] # target\n", " c_cost, c_grad_in, c_grad_out = \\\n", " word2vecCostAndGradient(vhat, u_index, outputVectors, dataset)\n", " cost += c_cost\n", " gradIn[cword_index] += c_grad_in\n", " gradOut += c_grad_out \n", " ### END YOUR CODE\n", "\n", " return cost, gradIn, gradOut" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# CBOW\n", "- 给定周围的字,发现中间的字。\n", "- 策略:把周围字的词向量加起来(为什么不是平均?),得到推测的中间字向量,与当前的中间字向量做更新。\n", "\n", "\n", "$\\begin{align}\t\\frac{J_{CBOW}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{U}}& = \\frac{\\partial F(\\boldsymbol{w}_{c}, \\hat{\\boldsymbol{v}})}{\\partial \\boldsymbol{U}} \\nonumber \\\\ \\frac{J_{CBOW}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{v}_{j}} &= \t\\frac{\\partial F(\\boldsymbol{w}_{c}, \\hat{\\boldsymbol{v}})}{\\partial \\hat{\\boldsymbol{v}}}, \\forall (j \\ne c) \\in \\{c-m \\dots c+m\\} \\nonumber \\\\ \\frac{J_{CBOW}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{v}_{j}} &= 0, \\forall (j \\ne c) \\notin \\{c-m \\dots c+m\\} \\nonumber\\end{align}$" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,\n", " dataset, word2vecCostAndGradient=softmaxCostAndGradient):\n", " \"\"\"CBOW model in word2vec\n", "\n", " Implement the continuous bag-of-words model in this function.\n", "\n", " Arguments/Return specifications: same as the skip-gram model\n", "\n", " Extra credit: Implementing CBOW is optional, but the gradient\n", " derivations are not. If you decide not to implement CBOW, remove\n", " the NotImplementedError.\n", " \"\"\"\n", "\n", " cost = 0.0\n", " gradIn = np.zeros(inputVectors.shape)\n", " gradOut = np.zeros(outputVectors.shape)\n", "\n", " ### YOUR CODE HERE\n", " predicted_indices = [tokens[word] for word in contextWords]\n", " predicted_vectors = inputVectors[predicted_indices]\n", " # 我记得笔记中提到的是做平均,这里待定。\n", " predicted = np.sum(predicted_vectors, axis=0)\n", " \n", " target = tokens[currentWord]\n", " cost,gradIn_predicted, gradOut = \\\n", " word2vecCostAndGradient(predicted, target, outputVectors, dataset)\n", " \n", " #注意下面是加,而不是赋值,因为同一个样本重复出现,山下文中可能出现相同的词汇\n", " for i in predicted_indices:\n", " gradIn[i] += gradIn_predicted \n", " \n", " ### END YOUR CODE\n", "\n", " return cost, gradIn, gradOut" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Testing normalizeRows...\n", "[[0.6 0.8 ]\n", " [0.4472136 0.89442719]]\n", "\n", "==== Gradient check for skip-gram ====\n", "Gradient check passed!\n", "Gradient check passed!\n", "\n", "==== Gradient check for CBOW ====\n", "Gradient check passed!\n", "Gradient check passed!\n", "\n", "=== Results ===\n", "(11.16610900153398, array([[ 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. ],\n", " [-1.26947339, -1.36873189, 2.45158957],\n", " [ 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. ]]), array([[-0.41045956, 0.18834851, 1.43272264],\n", " [ 0.38202831, -0.17530219, -1.33348241],\n", " [ 0.07009355, -0.03216399, -0.24466386],\n", " [ 0.09472154, -0.04346509, -0.33062865],\n", " [-0.13638384, 0.06258276, 0.47605228]]))\n", "(14.093692760899629, array([[ 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. ],\n", " [-3.86802836, -1.12713967, -1.52668625],\n", " [ 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. ]]), array([[-0.11265089, 0.05169237, 0.39321163],\n", " [-0.22716495, 0.10423969, 0.79292674],\n", " [-0.79674766, 0.36560539, 2.78107395],\n", " [-0.31602611, 0.14501561, 1.10309954],\n", " [-0.80620296, 0.36994417, 2.81407799]]))\n", "(0.7989958010906648, array([[ 0.23330542, -0.51643128, -0.8281311 ],\n", " [ 0.11665271, -0.25821564, -0.41406555],\n", " [ 0.11665271, -0.25821564, -0.41406555],\n", " [ 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. ]]), array([[ 0.80954933, 0.21962514, -0.54095764],\n", " [-0.03556575, -0.00964874, 0.02376577],\n", " [-0.13016109, -0.0353118 , 0.08697634],\n", " [-0.1650812 , -0.04478539, 0.11031068],\n", " [-0.47874129, -0.1298792 , 0.31990485]]))\n", "(7.89559320359914, array([[-2.98873309, -3.38440688, -2.62676289],\n", " [-1.49436655, -1.69220344, -1.31338145],\n", " [-1.49436655, -1.69220344, -1.31338145],\n", " [ 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. ]]), array([[ 0.21992784, 0.0596649 , -0.14696034],\n", " [-1.37825047, -0.37390982, 0.92097553],\n", " [-0.77702167, -0.21080061, 0.51922198],\n", " [-2.58955401, -0.7025281 , 1.73039366],\n", " [-2.36749007, -0.64228369, 1.58200593]]))\n" ] } ], "source": [ "#############################################\n", "# Testing functions below. DO NOT MODIFY! #\n", "#############################################\n", "\n", "def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C,\n", " word2vecCostAndGradient=softmaxCostAndGradient):\n", " batchsize = 50\n", " cost = 0.0\n", " grad = np.zeros(wordVectors.shape)\n", " N = wordVectors.shape[0]\n", " inputVectors = wordVectors[:int(N/2),:]\n", " outputVectors = wordVectors[int(N/2):,:]\n", " for i in range(batchsize):\n", " C1 = random.randint(1,C)\n", " centerword, context = dataset.getRandomContext(C1)\n", "\n", " if word2vecModel == skipgram:\n", " denom = 1\n", " else:\n", " denom = 1\n", "\n", " c, gin, gout = word2vecModel(\n", " centerword, C1, context, tokens, inputVectors, outputVectors,\n", " dataset, word2vecCostAndGradient)\n", " cost += c / batchsize / denom\n", " grad[:int(N/2), :] += gin / batchsize / denom\n", " grad[int(N/2):, :] += gout / batchsize / denom\n", "\n", " return cost, grad\n", "def test_word2vec():\n", " \"\"\" Interface to the dataset for negative sampling \"\"\"\n", " dataset = type('dummy', (), {})()\n", " def dummySampleTokenIdx():\n", " return random.randint(0, 4)\n", "\n", " def getRandomContext(C):\n", " tokens = [\"a\", \"b\", \"c\", \"d\", \"e\"]\n", " return tokens[random.randint(0,4)], \\\n", " [tokens[random.randint(0,4)] for i in range(2*C)]\n", " dataset.sampleTokenIdx = dummySampleTokenIdx\n", " dataset.getRandomContext = getRandomContext\n", "\n", " random.seed(31415)\n", " np.random.seed(9265)\n", " dummy_vectors = normalizeRows(np.random.randn(10,3))\n", " dummy_tokens = dict([(\"a\",0), (\"b\",1), (\"c\",2),(\"d\",3),(\"e\",4)])\n", " print (\"==== Gradient check for skip-gram ====\")\n", " gradcheck_naive(lambda vec: word2vec_sgd_wrapper(\n", " skipgram, dummy_tokens, vec, dataset, 5, softmaxCostAndGradient),\n", " dummy_vectors)\n", " gradcheck_naive(lambda vec: word2vec_sgd_wrapper(\n", " skipgram, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient),\n", " dummy_vectors)\n", " print (\"\\n==== Gradient check for CBOW ====\")\n", " gradcheck_naive(lambda vec: word2vec_sgd_wrapper(\n", " cbow, dummy_tokens, vec, dataset, 5, softmaxCostAndGradient),\n", " dummy_vectors)\n", " gradcheck_naive(lambda vec: word2vec_sgd_wrapper(\n", " cbow, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient),\n", " dummy_vectors)\n", "\n", " print (\"\\n=== Results ===\")\n", " print (skipgram(\"c\", 3, [\"a\", \"b\", \"e\", \"d\", \"b\", \"c\"],\n", " dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset))\n", " print (skipgram(\"c\", 1, [\"a\", \"b\"],\n", " dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset,\n", " negSamplingCostAndGradient))\n", " print (cbow(\"a\", 2, [\"a\", \"b\", \"c\", \"a\"],\n", " dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset))\n", " print (cbow(\"a\", 2, [\"a\", \"b\", \"a\", \"c\"],\n", " dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset,\n", " negSamplingCostAndGradient))\n", "\n", "\n", "if __name__ == \"__main__\":\n", " test_normalize_rows()\n", " test_word2vec()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }