{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "importing Jupyter notebook from q1_softmax.ipynb\n",
      "importing Jupyter notebook from q2_gradcheck.ipynb\n",
      "importing Jupyter notebook from q2_sigmoid.ipynb\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import random\n",
    "import import_ipynb\n",
    "\n",
    "from q1_softmax import softmax\n",
    "from q2_gradcheck import gradcheck_naive\n",
    "from q2_sigmoid import sigmoid, sigmoid_grad"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# normalizeRows\n",
    "![](https://raw.githubusercontent.com/mmmwhy/picture/master/picgo/20190502103029.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalizeRows(x):\n",
    "    \"\"\" Row normalization function\n",
    "\n",
    "    Implement a function that normalizes each row of a matrix to have\n",
    "    unit length.\n",
    "    \"\"\"\n",
    "\n",
    "    ### YOUR CODE HERE\n",
    "    denom = np.linalg.norm(x,axis=1,keepdims=True)\n",
    "    x = x/denom\n",
    "    ### END YOUR CODE\n",
    "\n",
    "    return x\n",
    "\n",
    "def test_normalize_rows():\n",
    "    print(\"Testing normalizeRows...\")\n",
    "    x = normalizeRows(np.array([[3.0,4.0],[1, 2]]))\n",
    "    print(x)\n",
    "    ans = np.array([[0.6,0.8],[0.4472136,0.89442719]])\n",
    "    assert np.allclose(x, ans, rtol=1e-05, atol=1e-06)\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# softmaxCostAndGradient\n",
    "- $\\begin{align}\t\\hat{\\boldsymbol{y}}_{o} = p(\\boldsymbol{o} \\vert \\boldsymbol{c}) =\\frac{exp(\\boldsymbol{u}_{0}^{T} \\boldsymbol{v}_{c})}{\\sum\\limits_{w=1}^{W} exp(\\boldsymbol{u}_{w}^{T} \\boldsymbol{v}_{c})}\n",
    "\\end{align}$ 计算得到 `Pred`\n",
    "\n",
    "- $\\frac{\\partial J}{\\partial{v_c}} =\\frac{\\partial J}{\\partial \\boldsymbol{z}} \\frac{\\partial z}{\\partial v_c} = U(\\hat{\\boldsymbol{y}} -\\boldsymbol{y})$ \n",
    "\n",
    "- $\\frac{\\partial J}{\\partial{U}} =\\frac{\\partial J}{\\partial \\boldsymbol{z}} \\frac{\\partial z}{\\partial U} = v_c(\\hat{\\boldsymbol{y}} -\\boldsymbol{y})^{T}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def softmaxCostAndGradient(predicted, target, outputVectors, dataset):\n",
    "    \"\"\" Softmax cost function for word2vec models\n",
    "\n",
    "    Implement the cost and gradients for one predicted word vector\n",
    "    and one target word vector as a building block for word2vec\n",
    "    models, assuming the softmax prediction function and cross\n",
    "    entropy loss.\n",
    "\n",
    "    Arguments:\n",
    "    predicted -- numpy ndarray, predicted word vector (\\hat{v} in\n",
    "                 the written component)\n",
    "    target -- integer, the index of the target word\n",
    "    outputVectors -- \"output\" vectors (as rows) for all tokens\n",
    "    dataset -- needed for negative sampling, unused here.\n",
    "\n",
    "    Return:\n",
    "    cost -- cross entropy cost for the softmax word prediction\n",
    "    gradPred -- the gradient with respect to the predicted word\n",
    "           vector\n",
    "    grad -- the gradient with respect to all the other word\n",
    "           vectors\n",
    "\n",
    "    We will not provide starter code for this function, but feel\n",
    "    free to reference the code you previously wrote for this\n",
    "    assignment!\n",
    "    \"\"\"\n",
    "\n",
    "    ### YOUR CODE HERE\n",
    "    # target是指公式中下标为o的那个，在skipgram\n",
    "    v_hat = predicted\n",
    "    \n",
    "    #注意到每行代表一个词向量\n",
    "    Pred = softmax(np.dot(outputVectors, v_hat))  \n",
    "    cost = -np.log(Pred[target])\n",
    "    \n",
    "    # \\hat{y} - y 的实现\n",
    "    Pred[target] -= 1.\n",
    "    # 关于V的梯度\n",
    "    gradPred = np.dot(outputVectors.T, Pred)\n",
    "    # 关于U的梯度,pred和v_hat都是向量，扩充为矩阵。\n",
    "    grad = np.outer(Pred, v_hat)\n",
    "    \n",
    "    ### END YOUR CODE\n",
    "        \n",
    "    return cost, gradPred, grad"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getNegativeSamples(target, dataset, K):\n",
    "    \"\"\" Samples K indexes which are not the target \"\"\"\n",
    "\n",
    "    indices = [None] * K\n",
    "    for k in range(K):\n",
    "        newidx = dataset.sampleTokenIdx()\n",
    "        while newidx == target:\n",
    "            newidx = dataset.sampleTokenIdx()\n",
    "        indices[k] = newidx\n",
    "    return indices"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# negSamplingCostAndGradient\n",
    "$\\begin{align}  \\frac{\\partial J}{\\partial v_c}&=\\left(\\sigma(u_o^Tv_c)-1\\right)u_o-\\sum_{k=1}^K\\left(\\sigma(-u_k^Tv_c)-1\\right)u_k\\\\  \\frac{\\partial J}{\\partial u_o}&=\\left(\\sigma(u_o^Tv_c)-1\\right)v_c\\\\  \\frac{\\partial J}{\\partial u_k}&=-\\left(\\sigma(-u_k^Tv_c)-1\\right)v_c\\\\  \\end{align}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,\n",
    "                               K=10):\n",
    "    \"\"\" Negative sampling cost function for word2vec models\n",
    "\n",
    "    Implement the cost and gradients for one predicted word vector\n",
    "    and one target word vector as a building block for word2vec\n",
    "    models, using the negative sampling technique. K is the sample\n",
    "    size.\n",
    "\n",
    "    Note: See test_word2vec below for dataset's initialization.\n",
    "\n",
    "    Arguments/Return Specifications: same as softmaxCostAndGradient\n",
    "    \"\"\"\n",
    "\n",
    "    # Sampling of indices is done for you. Do not modify this if you\n",
    "    # wish to match the autograder and receive points!\n",
    "    indices = [target]\n",
    "    indices.extend(getNegativeSamples(target, dataset, K))\n",
    "\n",
    "    ### YOUR CODE HERE\n",
    "    grad = np.zeros(outputVectors.shape)\n",
    "    gradPred =np.zeros(predicted.shape)\n",
    "    cost = 0\n",
    "    \n",
    "    z = sigmoid(np.dot(outputVectors[target], predicted))\n",
    "    cost -= np.log(z)\n",
    "    grad[target] += predicted * (z - 1.0)\n",
    "    gradPred += outputVectors[target] * (z-1.0)\n",
    "    \n",
    "    for k in range(K):\n",
    "        sample = indices[k + 1]\n",
    "        z = sigmoid(np.dot(outputVectors[sample], predicted))\n",
    "        # sigmoid(x) = 1 - sigmoid(-x)\n",
    "        cost -= np.log(1.0 - z)\n",
    "        # sigmoid(-x) -1 = -sigmoid(x)\n",
    "        grad[sample] += predicted * z\n",
    "        gradPred += outputVectors[sample] * z\n",
    "        \n",
    "        \n",
    "    ### END YOUR CODE\n",
    "\n",
    "    return cost, gradPred, grad"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Skip-gram\n",
    "- 给定中间词，寻找周围的词。\n",
    "- 获得cost和梯度，更新词向量。即，把每一个词的梯度，都加起来。\n",
    "\n",
    "$\\begin{align}\t\\frac{J_{skip-gram}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{U}} &= \t\t\\sum\\limits_{-m \\leq j \\leq m, j \\ne 0} \\frac{\\partial F(\\boldsymbol{w}_{c+j}, \\boldsymbol{v}_{c})}{\\partial \\boldsymbol{U}} \\nonumber \\\\\t\\frac{J_{skip-gram}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{v}_{c}} &= \t\\sum\\limits_{-m \\leq j \\leq m, j \\ne 0} \\frac{\\partial F(\\boldsymbol{w}_{c+j}, \\boldsymbol{v}_{c})}{\\partial \\boldsymbol{v}_{c}} \\nonumber \\\\\t\\frac{J_{skip-gram}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{v}_{j}} &= 0, \\forall j\\ne c \\nonumber\\end{align}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,\n",
    "             dataset, word2vecCostAndGradient=softmaxCostAndGradient):\n",
    "    \"\"\" Skip-gram model in word2vec\n",
    "\n",
    "    Implement the skip-gram model in this function.\n",
    "\n",
    "    Arguments:\n",
    "    currrentWord -- a string of the current center word\n",
    "    C -- integer, context size\n",
    "    contextWords -- list of no more than 2*C strings, the context words\n",
    "    tokens -- a dictionary that maps words to their indices in\n",
    "              the word vector list\n",
    "    inputVectors -- \"input\" word vectors (as rows) for all tokens\n",
    "    outputVectors -- \"output\" word vectors (as rows) for all tokens\n",
    "    word2vecCostAndGradient -- the cost and gradient function for\n",
    "                               a prediction vector given the target\n",
    "                               word vectors, could be one of the two\n",
    "                               cost functions you implemented above.\n",
    "\n",
    "    Return:\n",
    "    cost -- the cost function value for the skip-gram model\n",
    "    grad -- the gradient with respect to the word vectors\n",
    "    \"\"\"\n",
    "\n",
    "    cost = 0.0\n",
    "    gradIn = np.zeros(inputVectors.shape)\n",
    "    gradOut = np.zeros(outputVectors.shape)\n",
    "\n",
    "    ### YOUR CODE HERE\n",
    "    cword_index = tokens[currentWord]\n",
    "    vhat = inputVectors[cword_index]\n",
    "    \n",
    "    for j in contextWords:\n",
    "        u_index = tokens[j] # target\n",
    "        c_cost, c_grad_in, c_grad_out = \\\n",
    "            word2vecCostAndGradient(vhat, u_index, outputVectors, dataset)\n",
    "        cost += c_cost\n",
    "        gradIn[cword_index] += c_grad_in\n",
    "        gradOut += c_grad_out    \n",
    "    ### END YOUR CODE\n",
    "\n",
    "    return cost, gradIn, gradOut"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CBOW\n",
    "- 给定周围的字，发现中间的字。\n",
    "- 策略：把周围字的词向量加起来（为什么不是平均？），得到推测的中间字向量，与当前的中间字向量做更新。\n",
    "\n",
    "\n",
    "$\\begin{align}\t\\frac{J_{CBOW}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{U}}& = \\frac{\\partial F(\\boldsymbol{w}_{c}, \\hat{\\boldsymbol{v}})}{\\partial \\boldsymbol{U}} \\nonumber \\\\ \\frac{J_{CBOW}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{v}_{j}} &= \t\\frac{\\partial F(\\boldsymbol{w}_{c}, \\hat{\\boldsymbol{v}})}{\\partial \\hat{\\boldsymbol{v}}}, \\forall (j \\ne c) \\in \\{c-m \\dots c+m\\} \\nonumber \\\\ \\frac{J_{CBOW}(word_{c-m \\dots c+m})}{\\partial \\boldsymbol{v}_{j}} &= 0, \\forall (j \\ne c) \\notin \\{c-m \\dots c+m\\} \\nonumber\\end{align}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,\n",
    "         dataset, word2vecCostAndGradient=softmaxCostAndGradient):\n",
    "    \"\"\"CBOW model in word2vec\n",
    "\n",
    "    Implement the continuous bag-of-words model in this function.\n",
    "\n",
    "    Arguments/Return specifications: same as the skip-gram model\n",
    "\n",
    "    Extra credit: Implementing CBOW is optional, but the gradient\n",
    "    derivations are not. If you decide not to implement CBOW, remove\n",
    "    the NotImplementedError.\n",
    "    \"\"\"\n",
    "\n",
    "    cost = 0.0\n",
    "    gradIn = np.zeros(inputVectors.shape)\n",
    "    gradOut = np.zeros(outputVectors.shape)\n",
    "\n",
    "    ### YOUR CODE HERE\n",
    "    predicted_indices = [tokens[word] for word in contextWords]\n",
    "    predicted_vectors = inputVectors[predicted_indices]\n",
    "    # 我记得笔记中提到的是做平均，这里待定。\n",
    "    predicted = np.sum(predicted_vectors, axis=0)\n",
    "    \n",
    "    target = tokens[currentWord]\n",
    "    cost,gradIn_predicted, gradOut = \\\n",
    "        word2vecCostAndGradient(predicted, target, outputVectors, dataset)\n",
    "    \n",
    "    #注意下面是加，而不是赋值，因为同一个样本重复出现,山下文中可能出现相同的词汇\n",
    "    for i in predicted_indices:\n",
    "        gradIn[i] += gradIn_predicted    \n",
    "    \n",
    "    ### END YOUR CODE\n",
    "\n",
    "    return cost, gradIn, gradOut"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing normalizeRows...\n",
      "[[0.6        0.8       ]\n",
      " [0.4472136  0.89442719]]\n",
      "\n",
      "==== Gradient check for skip-gram ====\n",
      "Gradient check passed!\n",
      "Gradient check passed!\n",
      "\n",
      "==== Gradient check for CBOW      ====\n",
      "Gradient check passed!\n",
      "Gradient check passed!\n",
      "\n",
      "=== Results ===\n",
      "(11.16610900153398, array([[ 0.        ,  0.        ,  0.        ],\n",
      "       [ 0.        ,  0.        ,  0.        ],\n",
      "       [-1.26947339, -1.36873189,  2.45158957],\n",
      "       [ 0.        ,  0.        ,  0.        ],\n",
      "       [ 0.        ,  0.        ,  0.        ]]), array([[-0.41045956,  0.18834851,  1.43272264],\n",
      "       [ 0.38202831, -0.17530219, -1.33348241],\n",
      "       [ 0.07009355, -0.03216399, -0.24466386],\n",
      "       [ 0.09472154, -0.04346509, -0.33062865],\n",
      "       [-0.13638384,  0.06258276,  0.47605228]]))\n",
      "(14.093692760899629, array([[ 0.        ,  0.        ,  0.        ],\n",
      "       [ 0.        ,  0.        ,  0.        ],\n",
      "       [-3.86802836, -1.12713967, -1.52668625],\n",
      "       [ 0.        ,  0.        ,  0.        ],\n",
      "       [ 0.        ,  0.        ,  0.        ]]), array([[-0.11265089,  0.05169237,  0.39321163],\n",
      "       [-0.22716495,  0.10423969,  0.79292674],\n",
      "       [-0.79674766,  0.36560539,  2.78107395],\n",
      "       [-0.31602611,  0.14501561,  1.10309954],\n",
      "       [-0.80620296,  0.36994417,  2.81407799]]))\n",
      "(0.7989958010906648, array([[ 0.23330542, -0.51643128, -0.8281311 ],\n",
      "       [ 0.11665271, -0.25821564, -0.41406555],\n",
      "       [ 0.11665271, -0.25821564, -0.41406555],\n",
      "       [ 0.        ,  0.        ,  0.        ],\n",
      "       [ 0.        ,  0.        ,  0.        ]]), array([[ 0.80954933,  0.21962514, -0.54095764],\n",
      "       [-0.03556575, -0.00964874,  0.02376577],\n",
      "       [-0.13016109, -0.0353118 ,  0.08697634],\n",
      "       [-0.1650812 , -0.04478539,  0.11031068],\n",
      "       [-0.47874129, -0.1298792 ,  0.31990485]]))\n",
      "(7.89559320359914, array([[-2.98873309, -3.38440688, -2.62676289],\n",
      "       [-1.49436655, -1.69220344, -1.31338145],\n",
      "       [-1.49436655, -1.69220344, -1.31338145],\n",
      "       [ 0.        ,  0.        ,  0.        ],\n",
      "       [ 0.        ,  0.        ,  0.        ]]), array([[ 0.21992784,  0.0596649 , -0.14696034],\n",
      "       [-1.37825047, -0.37390982,  0.92097553],\n",
      "       [-0.77702167, -0.21080061,  0.51922198],\n",
      "       [-2.58955401, -0.7025281 ,  1.73039366],\n",
      "       [-2.36749007, -0.64228369,  1.58200593]]))\n"
     ]
    }
   ],
   "source": [
    "#############################################\n",
    "# Testing functions below. DO NOT MODIFY!   #\n",
    "#############################################\n",
    "\n",
    "def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C,\n",
    "                         word2vecCostAndGradient=softmaxCostAndGradient):\n",
    "    batchsize = 50\n",
    "    cost = 0.0\n",
    "    grad = np.zeros(wordVectors.shape)\n",
    "    N = wordVectors.shape[0]\n",
    "    inputVectors = wordVectors[:int(N/2),:]\n",
    "    outputVectors = wordVectors[int(N/2):,:]\n",
    "    for i in range(batchsize):\n",
    "        C1 = random.randint(1,C)\n",
    "        centerword, context = dataset.getRandomContext(C1)\n",
    "\n",
    "        if word2vecModel == skipgram:\n",
    "            denom = 1\n",
    "        else:\n",
    "            denom = 1\n",
    "\n",
    "        c, gin, gout = word2vecModel(\n",
    "            centerword, C1, context, tokens, inputVectors, outputVectors,\n",
    "            dataset, word2vecCostAndGradient)\n",
    "        cost += c / batchsize / denom\n",
    "        grad[:int(N/2), :] += gin / batchsize / denom\n",
    "        grad[int(N/2):, :] += gout / batchsize / denom\n",
    "\n",
    "    return cost, grad\n",
    "def test_word2vec():\n",
    "    \"\"\" Interface to the dataset for negative sampling \"\"\"\n",
    "    dataset = type('dummy', (), {})()\n",
    "    def dummySampleTokenIdx():\n",
    "        return random.randint(0, 4)\n",
    "\n",
    "    def getRandomContext(C):\n",
    "        tokens = [\"a\", \"b\", \"c\", \"d\", \"e\"]\n",
    "        return tokens[random.randint(0,4)], \\\n",
    "            [tokens[random.randint(0,4)] for i in range(2*C)]\n",
    "    dataset.sampleTokenIdx = dummySampleTokenIdx\n",
    "    dataset.getRandomContext = getRandomContext\n",
    "\n",
    "    random.seed(31415)\n",
    "    np.random.seed(9265)\n",
    "    dummy_vectors = normalizeRows(np.random.randn(10,3))\n",
    "    dummy_tokens = dict([(\"a\",0), (\"b\",1), (\"c\",2),(\"d\",3),(\"e\",4)])\n",
    "    print (\"==== Gradient check for skip-gram ====\")\n",
    "    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(\n",
    "        skipgram, dummy_tokens, vec, dataset, 5, softmaxCostAndGradient),\n",
    "        dummy_vectors)\n",
    "    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(\n",
    "        skipgram, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient),\n",
    "        dummy_vectors)\n",
    "    print (\"\\n==== Gradient check for CBOW      ====\")\n",
    "    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(\n",
    "        cbow, dummy_tokens, vec, dataset, 5, softmaxCostAndGradient),\n",
    "        dummy_vectors)\n",
    "    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(\n",
    "        cbow, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient),\n",
    "        dummy_vectors)\n",
    "\n",
    "    print (\"\\n=== Results ===\")\n",
    "    print (skipgram(\"c\", 3, [\"a\", \"b\", \"e\", \"d\", \"b\", \"c\"],\n",
    "            dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset))\n",
    "    print (skipgram(\"c\", 1, [\"a\", \"b\"],\n",
    "            dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset,\n",
    "            negSamplingCostAndGradient))\n",
    "    print (cbow(\"a\", 2, [\"a\", \"b\", \"c\", \"a\"],\n",
    "            dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset))\n",
    "    print (cbow(\"a\", 2, [\"a\", \"b\", \"a\", \"c\"],\n",
    "            dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset,\n",
    "            negSamplingCostAndGradient))\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    test_normalize_rows()\n",
    "    test_word2vec()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}