{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import random" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "importing Jupyter notebook from q1_softmax.ipynb\n", "importing Jupyter notebook from q2_sigmoid.ipynb\n", "importing Jupyter notebook from q2_gradcheck.ipynb\n" ] } ], "source": [ "import import_ipynb\n", "from q1_softmax import softmax\n", "from q2_sigmoid import sigmoid, sigmoid_grad\n", "from q2_gradcheck import gradcheck_naive" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# forword\n", "![](https://raw.githubusercontent.com/mmmwhy/picture/master/picgo/20190502101618.png)\n", "- $h = sigmoid(xW_1+b_1)$, \n", "- $\\hat{y} = softmax(hW_2+b_2)$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# backward\n", "\n", "\n", "- $\\delta_1 = \\frac{\\partial{CE}}{\\partial{z_2}} = \\hat{y} - y$ \n", "- $\\begin{align} \\delta_2 = \\frac{\\partial{CE}}{\\partial{h}} = \\frac{\\partial{CE}}{\\partial{z_2}} \\frac{\\partial{z_2}}{\\partial{h}} = \\delta_1W_2^T \\end{align}$\n", "- $\\begin{align}\\delta_3 = \\frac{\\partial{CE}}{z_1} = \\frac{\\partial{CE}}{\\partial{h}}\\frac{\\partial{h}}{\\partial{z_1}} = \\delta_2 \\frac{\\partial{h}}{\\partial{z_1}}= \\delta_2 \\circ \\sigma'(z_1)\\end{align}$\n", "\n", "- $\\frac{\\partial{CE}}{\\partial{x}}=\\delta_3\\frac{\\partial{z_1}}{\\partial{x}} = \\delta_3W_1^T $" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def forward_backward_prop(data, labels, params, dimensions):\n", " \"\"\"\n", " Forward and backward propagation for a two-layer sigmoidal network\n", " Compute the forward propagation and for the cross entropy cost,\n", " and backward propagation for the gradients for all parameters.\n", " Arguments:\n", " data -- M x Dx matrix, where each row is a training example.\n", " labels -- M x Dy matrix, where each row is a one-hot vector.\n", " params -- Model parameters, these are unpacked for you.\n", " dimensions -- A tuple of input dimension, number of hidden units\n", " and output dimension\n", " \"\"\"\n", " ### Unpack network parameters (do not modify)\n", " ofs = 0\n", " Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])\n", " W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))\n", " ofs += Dx * H\n", " b1 = np.reshape(params[ofs:ofs + H], (1, H))\n", " ofs += H\n", " W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))\n", " ofs += H * Dy\n", " b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))\n", " ### YOUR CODE HERE: forward propagation\n", " h = sigmoid(np.dot(data,W1) + b1)\n", " yhat = softmax(np.dot(h,W2) + b2)\n", " ### END YOUR CODE\n", " ### YOUR CODE HERE: backward propagation\n", " cost = np.sum(-np.log(yhat[labels==1])) \n", " \n", " d1 = (yhat - labels)\n", " gradW2 = np.dot(h.T, d1)\n", " gradb2 = np.sum(d1,0,keepdims=True)\n", " \n", " d2 = np.dot(d1,W2.T)\n", " # h = sigmoid(z_1)\n", " d3 = sigmoid_grad(h) * d2\n", " gradW1 = np.dot(data.T,d3)\n", " gradb1 = np.sum(d3,0)\n", " \n", " ### END YOUR CODE\n", " ### Stack gradients (do not modify)\n", " grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),\n", " gradW2.flatten(), gradb2.flatten()))\n", " return cost, grad" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running sanity check...\n", "Gradient check passed!\n" ] } ], "source": [ 
"def sanity_check():\n", " \"\"\"\n", " Set up fake data and parameters for the neural network, and test using\n", " gradcheck.\n", " \"\"\"\n", " print(\"Running sanity check...\")\n", "\n", " N = 20\n", " dimensions = [10, 5, 10]\n", " data = np.random.randn(N, dimensions[0]) # each row will be a datum\n", " labels = np.zeros((N, dimensions[2]))\n", " for i in range(N):\n", " labels[i, random.randint(0,dimensions[2]-1)] = 1\n", "\n", " params = np.random.randn((dimensions[0] + 1) * dimensions[1] + (\n", " dimensions[1] + 1) * dimensions[2], )\n", "\n", " gradcheck_naive(lambda params:\n", " forward_backward_prop(data, labels, params, dimensions), params)\n", " \n", "\n", "if __name__ == \"__main__\":\n", " sanity_check()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }