{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RLN keras tutorial" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "This is a quick tutorial of the use of the Keras RLN implementation.
First, let's import the required packages and create the training and test sets. In this tutorial we use the Boston housing price regression dataset, with additional noise features." ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "from keras.wrappers.scikit_learn import KerasRegressor\n", "from sklearn.preprocessing import StandardScaler\n", "from keras import regularizers\n", "from Keras_implementation import RLNCallback\n", "from keras.datasets import boston_housing\n", "from keras.models import Sequential\n", "from keras.layers import Dense\n", "from numpy.random import seed\n", "from keras.backend import eval as keras_eval\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "(x_train, y_train), (x_test, y_test) = boston_housing.load_data()\n", "\n", "# Add noisy features\n", "noise_features = 1000\n", "x_train = np.concatenate([x_train, np.random.normal(size=(x_train.shape[0], noise_features))], axis=1)\n", "x_test = np.concatenate([x_test, np.random.normal(size=(x_test.shape[0], noise_features))], axis=1)\n", "\n", "# Scale features\n", "scaler = StandardScaler()\n", "x_train = scaler.fit_transform(x_train)\n", "x_test = scaler.transform(x_test)\n", "\n", "INPUT_DIM = x_train.shape[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's create a basic Keras model and a test function:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def base_model(layers=4, l1=0):\n", "    assert layers > 1\n", "\n", "    def build_fn():\n", "        inner_l1 = l1\n", "        # Create the model\n", "        model = Sequential()\n", "        # Construct the layers of the model so that their widths form a geometric series\n", "        prev_width = INPUT_DIM\n", "        for width in np.exp(np.log(INPUT_DIM) * np.arange(layers - 1, 0, -1) / layers):\n", "            width = int(np.round(width))\n", "            model.add(Dense(width, input_dim=prev_width, kernel_initializer='glorot_normal', activation='relu',\n", "                            kernel_regularizer=regularizers.l1(inner_l1)))\n", "            # For efficiency we only regularize the first layer\n", "            inner_l1 = 0\n", "            prev_width = width\n", "\n", "        model.add(Dense(1, kernel_initializer='glorot_normal'))\n", "\n", "        # Compile the model\n", "        model.compile(loss='mean_squared_error', optimizer='rmsprop')\n", "        return model\n", "    return build_fn\n", "\n", "MJTCP = 32292  # Michael Jordan total career points\n", "\n", "def test_model(build_fn, model_name, num_repeats=10):\n", "    seed(MJTCP)\n", "    results = np.zeros(num_repeats)\n", "    for i in range(num_repeats):\n", "        reg = KerasRegressor(build_fn=build_fn, epochs=100, batch_size=10, verbose=0)\n", "        reg.fit(x_train, y_train)\n", "        results[i] = reg.score(x_test, y_test)\n", "    print(\"%s: %.2f (%.2f) MSE\" % (model_name, results.mean(), results.std()))\n", "    return results.mean()" ] },
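{ "cell_type": "markdown", "metadata": {}, "source": [ "Before tuning, it can help to see what architectures `base_model` actually builds. The short cell below is only an illustration (it is not part of the training pipeline): it prints the hidden-layer widths produced by the geometric-series rule for a few depths." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Illustration only: the hidden-layer widths base_model would choose for a few depths.\n", "# The widths form a geometric series between INPUT_DIM and the single output unit.\n", "for n_layers in (2, 3, 4, 5):\n", "    widths = [int(np.round(w)) for w in np.exp(np.log(INPUT_DIM) * np.arange(n_layers - 1, 0, -1) / n_layers)]\n", "    print(\"%d layers -> hidden widths: %s\" % (n_layers, widths))" ] },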
{ "cell_type": "markdown", "metadata": {}, "source": [ "Let's optimize the depth and L1 regularization of the network:" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Network with 2 layers: 136.02 (5.17) MSE\n", "Network with 3 layers: 112.52 (5.58) MSE\n", "Network with 4 layers: 106.84 (10.62) MSE\n", "Network with 5 layers: 135.84 (103.32) MSE\n", "The best results of an unregularized network are achieved with depth 4\n" ] } ], "source": [ "layers = 1\n", "\n", "prev_score = None\n", "cur_score = np.inf\n", "\n", "while prev_score is None or cur_score < prev_score:\n", "    prev_score = cur_score\n", "    layers += 1\n", "    cur_score = test_model(base_model(layers=layers), \"Network with %d layers\" % layers)\n", "\n", "layers -= 1\n", "print \"The best results of an unregularized network are achieved with depth %d\" % layers" ] }, { "cell_type": "code", "execution_count": 114, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "L1 regularization of 1E-02: 53.05 (6.07) MSE\n", "L1 regularization of 1E-01: 48.57 (4.31) MSE\n", "L1 regularization of 1E+00: 144.00 (9.74) MSE\n", "The best L1 regularization is achieved with l1 = 1E-01\n" ] } ], "source": [ "l1 = 0.001\n", "\n", "prev_score = None\n", "cur_score = np.inf\n", "\n", "while prev_score is None or cur_score < prev_score:\n", "    prev_score = cur_score\n", "    l1 *= 10\n", "    cur_score = test_model(base_model(layers=layers, l1=l1), \"L1 regularization of %.0E\" % l1)\n", "\n", "best_l1_score = prev_score\n", "\n", "l1 /= 10\n", "print \"The best L1 regularization is achieved with l1 = %.0E\" % l1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Keras RLN implementation uses a callback to change the weights of the layer it regularizes. The callback receives that layer as a parameter and is passed to the fit function:" ] }, { "cell_type": "code", "execution_count": 158, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def RLN(layers=4, **rln_kwargs):\n", "    def build_fn():\n", "        model = base_model(layers=layers)()\n", "\n", "        # For efficiency we only regularize the first layer\n", "        rln_callback = RLNCallback(model.layers[0], **rln_kwargs)\n", "\n", "        # Change the fit function of the model to accept rln_callback:\n", "        orig_fit = model.fit\n", "        def rln_fit(*args, **fit_kwargs):\n", "            orig_callbacks = fit_kwargs.pop('callbacks', [])\n", "            rln_callbacks = orig_callbacks + [rln_callback]\n", "            return orig_fit(*args, callbacks=rln_callbacks, **fit_kwargs)\n", "\n", "        model.fit = rln_fit\n", "\n", "        return model\n", "\n", "    return build_fn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's apply RLN to this dataset and find the optimal average regularization (Theta) and learning rate (nu).\n", "* The average regularization (Theta) is on a log scale, while the Keras regularization coefficient (l1) is not.\n", "* RLNs tend to require a much smaller average regularization; typically exp(Theta) << l1.\n", "* The learning rate (nu) can have a dramatic effect on the performance of the network, so it is important to tune it well.\n", "* Because we optimize very small coefficients on a log scale, the gradients tend to be quite small, so a large learning rate is required." ] },
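{ "cell_type": "markdown", "metadata": {}, "source": [ "As a minimal sketch (illustration only), the callback can also be attached to a plain Keras `fit` call, without the `KerasRegressor` wrapper used by `test_model`. The hyperparameter values below are placeholders, chosen only to show how Theta (`avg_reg`) and the learning rate are passed:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Minimal sketch: attach the RLN callback to a plain fit call.\n", "# avg_reg (Theta) and learning_rate are placeholder values, not tuned ones.\n", "demo_model = base_model(layers=4)()\n", "demo_callback = RLNCallback(demo_model.layers[0], norm=1, avg_reg=-10, learning_rate=1e6)\n", "demo_model.fit(x_train, y_train, epochs=5, batch_size=10, verbose=0, callbacks=[demo_callback])" ] },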
] }, { "cell_type": "code", "execution_count": 144, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RLN with Theta=-8 and learning_rate=1.0E+06: 38.91 (2.31) MSE\n", "RLN with Theta=-10 and learning_rate=1.0E+05: 34.60 (4.27) MSE\n", "RLN with Theta=-10 and learning_rate=1.0E+06: 21.34 (2.37) MSE\n", "RLN with Theta=-10 and learning_rate=1.0E+07: 99.58 (8.51) MSE\n", "RLN with Theta=-12 and learning_rate=1.0E+06: 86.83 (4.99) MSE\n", "The best RLN is achieved with Theta=-10 and learning_rate=1.0E+06\n" ] } ], "source": [ "best_rln_score = np.inf\n", "Theta, learning_rate = None, None\n", "\n", "for cur_Theta, log_learning_rate in [(-8, 6), (-10, 5), (-10, 6), (-10, 7), (-12, 6)]:\n", " cur_learning_rate = np.power(10, log_learning_rate)\n", " cur_score = test_model(RLN(layers=layers, norm=1, avg_reg=cur_Theta, learning_rate=cur_learning_rate), \n", " \"RLN with Theta=%s and learning_rate=%.1E\" % (cur_Theta, cur_learning_rate))\n", " if cur_score < best_rln_score:\n", " Theta, learning_rate = cur_Theta, cur_learning_rate\n", " best_rln_score = cur_score\n", "\n", "print \"The best RLN is achieved with Theta=%d and learning_rate=%.1E\" % (Theta, learning_rate)" ] }, { "cell_type": "code", "execution_count": 174, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "We see that RLN outperforms L1 regularization on this dataset, 21.34 < 48.57\n", "We also see that the average regularization required in RLN is much smaller than required in L1 regularized models:\n", "4.5E-05 << 1.0E-01\n" ] } ], "source": [ "print \"We see that RLN outperforms L1 regularization on this dataset, %.2f < %.2f\" %(best_rln_score, best_l1_score)\n", "print \"We also see that the average regularization required in RLN is much smaller than required in L1 regularized models:\"\n", "print \"%.1E << %.1E\" % (np.exp(Theta), l1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 0 }