\n",
"
\n",
"\\begin{align}\n",
"f_{1,\\theta} &: \\mathbb{R}^5 \\to \\mathbb{R}^3\\\\\n",
"f_{2,\\theta} &: \\mathbb{R}^3 \\to \\mathbb{R}^3\\\\\n",
"f_{3,\\theta} &: \\mathbb{R}^3 \\to \\mathbb{R}^1\\\\\n",
"g_\\theta &= f_{3,\\theta} \\circ f_{2,\\theta} \\circ f_{1,\\theta}\\\\\n",
"g_\\theta(\\mathbf{x}) &= f_{3,\\theta}(f_{2,\\theta}(f_{1,\\theta}(\\mathbf{x})))\\\\\n",
"g_\\theta &: \\mathbb{R}^5 \\to \\mathbb{R}^1\n",
"\\end{align}\n",
"
\n",
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Calculation of Gradients\n",
"
\n",
"
\n",
"\\begin{align}\n",
"g_\\theta(\\mathbf{x}) &= \\text{sigmoid}(\\mathbf{W}^{1\\times 3}_3\\text{sigmoid}(\\mathbf{W}^{3\\times 3}_2\\text{sigmoid}(\\mathbf{W}^{3\\times 5}_1\\mathbf{x}+\\mathbf{b}_1)+\\mathbf{b}_2)+\\mathbf{b}_3)\\\\\n",
"\\frac{\\partial \\mathcal{L}(g_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{W}^{1\\times 3}_3} &= \\text{ ?}\\\\\n",
"\\frac{\\partial \\mathcal{L}(g_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{b}_3} &= \\text{ ?}\\\\\n",
"\\frac{\\partial \\mathcal{L}(g_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{W}^{3\\times 3}_2} &= \\text{ ?}\\\\\n",
"\\frac{\\partial \\mathcal{L}(g_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{b}_2} &= \\text{ ?}\\\\\n",
"\\frac{\\partial \\mathcal{L}(g_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{W}^{3\\times 5}_1} &= \\text{ ?}\\\\\n",
"\\frac{\\partial \\mathcal{L}(g_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{b}_1} &= \\text{ ?}\n",
"\\end{align}\n",
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Chain Rule\n",
"\n",
"\\begin{align}\n",
"\\frac{\\partial (f \\circ g)}{\\partial \\theta} &= \\frac{\\partial (f \\circ g)}{\\partial g} \\frac{\\partial g}{\\partial \\theta}\n",
"\\end{align}"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Example"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"
\n",
"\\begin{align}\n",
"\\frac{\\partial \\mathcal{L}(\\text{sigmoid}(\\mathbf{W}\\mathbf{x}),\\mathbf{y})}{\\partial \\mathbf{W}} &= \\frac{\\partial \\mathcal{L}(\\text{sigmoid}(\\mathbf{W}\\mathbf{x}),\\mathbf{y})}{\\partial \\text{ sigmoid}(\\mathbf{W}\\mathbf{x})} \\frac{\\partial \\text{ sigmoid}(\\mathbf{W}\\mathbf{x})}{\\partial \\mathbf{Wx}} \\frac{\\partial{\\mathbf{Wx}}}{\\partial\\mathbf{W}}\n",
"\\end{align}\n",
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"\\begin{align}\n",
"\\mathbf{h} &= \\mathbf{W}\\mathbf{x}\\\\\n",
"\\mathbf{z} &= \\text{sigmoid}(\\mathbf{h})\\\\\n",
"\\mathcal{L}(\\mathbf{z},\\mathbf{y}) &= \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2\n",
"\\end{align}"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"\\begin{align}\n",
"\\frac{\\partial \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2}{\\partial \\mathbf{W}} &= \\frac{\\partial \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2}{\\partial\\mathbf{z}} \\frac{\\partial\\mathbf{z}}{\\partial \\mathbf{h}} \\frac{\\partial \\mathbf{h}}{\\partial \\mathbf{W}}\n",
"\\end{align}"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Example cont."
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"\\begin{align}\n",
"\\mathbf{h} &= \\mathbf{W}\\mathbf{x}\\\\\n",
"\\mathbf{z} &= \\text{sigmoid}(\\mathbf{h})\\\\\n",
"\\mathcal{L}(\\mathbf{z},\\mathbf{y}) &= \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2\n",
"\\end{align}"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"\\begin{align}\n",
"\\frac{\\partial \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2}{\\partial \\mathbf{W}} &= \\frac{\\partial \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2}{\\partial\\mathbf{z}} \\frac{\\partial\\mathbf{z}}{\\partial \\mathbf{h}} \\frac{\\partial \\mathbf{h}}{\\partial \\mathbf{W}}\\\\\n",
"\\partial \\mathbf{z} &= \\mathbf{z}-\\mathbf{y}\\\\\n",
"\\partial \\mathbf{h} &= \\partial \\mathbf{z}\\odot\\text{sigmoid}(\\mathbf{h})\\odot(1 - \\text{sigmoid}(\\mathbf{h}))\\\\\n",
"\\partial \\mathbf{W} &= \\partial\\mathbf{h}\\otimes\\mathbf{x}\n",
"\\end{align}"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Module"
]
},
{
"cell_type": "markdown",
"metadata": {
"hide_input": false
},
"source": [
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Backpropagation"
]
},
{
"cell_type": "markdown",
"metadata": {
"hide_input": false
},
"source": [
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Deep Learning Libraries\n",
"- PyTorch\n",
"- DyNet\n",
"- Theano\n",
"- DeepLearning4J\n",
"- autograd\n",
"- **TensorFlow**\n",
"- ..."
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Logistic Regression"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"seed = 0\n",
"# input: a column vector with input_sz rows; the static shape lets TensorFlow\n",
"# reject wrongly shaped feeds (same convention as the MLP placeholder)\n",
"input_sz = 3\n",
"output_sz = 1\n",
"x = tf.placeholder(tf.float32, shape=[input_sz, 1])\n",
"# parameters: W is drawn uniformly from [-0.1, 0.1), b starts at zero\n",
"W = tf.Variable(tf.random_uniform([output_sz,input_sz], -0.1, 0.1, seed=seed))\n",
"b = tf.Variable(tf.zeros(output_sz))\n",
"# f_theta\n",
"z = tf.nn.sigmoid(tf.matmul(W,x) + b) #sigmoid(Wx + b)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.07982747, 0.09403337, 0.06975283]], dtype=float32)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sess = tf.Session()\n",
"sess.run(tf.global_variables_initializer()) #initialize W and b\n",
"sess.run(W)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.], dtype=float32)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sess.run(b)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Logistic Regression cont."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Forward: $\\mathbf{z} = f_\\theta(\\mathbf{x})$"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.64387923]], dtype=float32)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sess.run(z, feed_dict={x: [[-5.5],[2.0],[-0.5]]})"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"Backward: $\\partial\\mathbf{W},\\partial\\mathbf{b},\\partial\\mathbf{x}$ given upstream gradient $\\partial\\mathbf{z}$"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[array([[-0.13708647, 0.04984963, -0.01246241]], dtype=float32),\n",
" array([ 0.02492481], dtype=float32),\n",
" array([[ 0.00034354],\n",
" [-0.00093437],\n",
" [-0.00204336]], dtype=float32)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# W and b were already initialised when the session was created. Running the\n",
"# initializer again here would re-draw W (a seeded random op advances its\n",
"# sequence on every run within a session), so the gradients would no longer\n",
"# correspond to the W and z values displayed by the earlier cells.\n",
"gradz = [[0.1]]  # upstream gradient with respect to z (1x1, like z)\n",
"grad = tf.gradients(z, [W, b, x], grad_ys=gradz)  # gradients w.r.t. W, b and x\n",
"sess.run(grad, feed_dict={x: [[-5.5],[2.0],[-0.5]]})"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Multi-layer Perceptron"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1.35592151]], dtype=float32)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#input\n",
"x = tf.placeholder(tf.float32, shape=[5,1])\n",
"#parameters\n",
"W1 = tf.Variable(tf.random_uniform([3,5], seed=seed))\n",
"b1 = tf.Variable(tf.zeros([3,1]))\n",
"W2 = tf.Variable(tf.random_uniform([3,3], seed=seed))\n",
"b2 = tf.Variable(tf.zeros([3,1]))\n",
"W3 = tf.Variable(tf.random_uniform([1,3], seed=seed))\n",
"b3 = tf.Variable(tf.zeros([1,1]))\n",
"#model\n",
"h1 = tf.nn.sigmoid(tf.matmul(W1,x) + b1) \n",
"h2 = tf.nn.sigmoid(tf.matmul(W2,h1) + b2)\n",
"mlp_z = tf.matmul(W3,h2) + b3 \n",
"\n",
"sess.run(tf.global_variables_initializer())\n",
"x_value = [[-5.5], [2.0], [-0.5], [2.0], [4.0]]\n",
"sess.run(mlp_z, feed_dict={x: x_value})"
]
},
{
"cell_type": "markdown",
"metadata": {
"hideOutput": true,
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.07483862]]\n",
"[[ 0.00129254]]\n",
"[[ 7.06945139e-06]]\n",
"[[ 3.01882075e-08]]\n",
"[[ 1.25567112e-10]]\n"
]
}
],
"source": [
"n_epochs = 5     # number of optimisation steps to run\n",
"print_every = 1  # report the loss every print_every epochs\n",
"\n",
"target_z = tf.constant([[1.0]]) # what the output should be\n",
"loss = tf.square(target_z - mlp_z) # the loss function\n",
"optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)\n",
"opt_op = optimizer.minimize(loss) # the TF operation that performs optimisation steps\n",
"# run the initializer again after building the optimizer; this re-initialises\n",
"# every variable in the graph, including the MLP weights\n",
"sess.run(tf.global_variables_initializer())\n",
"for epoch in range(n_epochs):\n",
"    # one optimisation step on the single training example, fetching the loss as well\n",
"    _, loss_value = sess.run([opt_op, loss], feed_dict={x: x_value})\n",
"    if epoch % print_every == 0:\n",
"        print(loss_value)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"It learned!"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.99999923]], dtype=float32)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sess.run(mlp_z, feed_dict={x: x_value})"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Next\n",
"\n",
"Inputs are always (continuous) **vectors**. \n",
"\n",
"What vectors to use in NLP? "
]
}
],
"metadata": {
"celltoolbar": "Slideshow",
"hide_input": false,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"latex_envs": {
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 0
},
"livereveal": {
"theme": "white",
"transition": "concave"
}
},
"nbformat": 4,
"nbformat_minor": 1
}