{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Swinging Up the Pendulum Using DDPG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Goal:\n",
    "\n",
    "\n",
    "Say, we have a pendulum which starts in a random position and the goal of our agent is  to swing the pendulum up so that it stays upright."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, let us import necessary libraries\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import gym"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Now let us define the hyperparameters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# number of steps in each episode\n",
    "epsiode_steps = 500 \n",
    "\n",
    "# learning rate for actor\n",
    "lr_a = 0.001    \n",
    "\n",
    "# learning rate for critic\n",
    "lr_c = 0.002 \n",
    "\n",
    "# discount factor\n",
    "gamma = 0.9 \n",
    "\n",
    "# soft replacement\n",
    "alpha = 0.01 \n",
    "\n",
    "# replay buffer size\n",
    "memory = 10000 \n",
    "\n",
    "# batch size for training\n",
    "batch_size = 32     \n",
    "render = False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Next, we define the class DDPG which implements the DDPG algorithm. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class DDPG(object):\n",
    "    def __init__(self, no_of_actions, no_of_states, a_bound,):\n",
    "        \n",
    "        # initialize the memory with shape as no of actions, no of states and our defined memory size\n",
    "        self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)\n",
    "        \n",
    "        # initialize pointer to point to our experience buffer\n",
    "        self.pointer = 0\n",
    "        \n",
    "        # initialize tensorflow session\n",
    "        self.sess = tf.Session()\n",
    "        \n",
    "        # initialize the variance for OU process for exploring policies\n",
    "        self.noise_variance = 3.0\n",
    "        \n",
    "        self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,\n",
    "        \n",
    "        # placeholder for current state, next state and rewards\n",
    "        self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')\n",
    "        self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')\n",
    "        self.reward = tf.placeholder(tf.float32, [None, 1], 'r')\n",
    "        \n",
    "        # build the actor network which has separate eval(primary) and target network\n",
    "        with tf.variable_scope('Actor'):\n",
    "            self.a = self.build_actor_network(self.state, scope='eval', trainable=True)\n",
    "            a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)\n",
    "            \n",
    "        # build the critic network which has separate eval(primary) and target network    \n",
    "        with tf.variable_scope('Critic'):\n",
    "            q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)\n",
    "            q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)\n",
    "            \n",
    "\n",
    "        # initialize the network parameters\n",
    "        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')\n",
    "        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')\n",
    "        \n",
    "        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')\n",
    "        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')\n",
    "\n",
    "        # update target value\n",
    "        self.soft_replace = [[tf.assign(at, (1-alpha)*at+alpha*ae), tf.assign(ct, (1-alpha)*ct+alpha*ce)]\n",
    "            for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]\n",
    "        \n",
    "        \n",
    "        # compute target Q value, we know that Q(s,a) = reward + gamma * Q'(s',a')\n",
    "        q_target = self.reward + gamma * q_\n",
    "        \n",
    "    \n",
    "        # compute TD error i.e  actual - predicted values\n",
    "        td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)\n",
    "        \n",
    "        # train the critic network with adam optimizer\n",
    "        self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name=\"adam-ink\", var_list = self.ce_params)\n",
    "        \n",
    "        # compute the loss in actor network\n",
    "        a_loss = - tf.reduce_mean(q)    \n",
    "        \n",
    "        # train the actor network with adam optimizer for minimizing the loss\n",
    "        self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)\n",
    "\n",
    "        # initialize summary writer to visualize our network in tensorboard\n",
    "        tf.summary.FileWriter(\"logs\", self.sess.graph)\n",
    "        \n",
    "        # initialize all variables\n",
    "        self.sess.run(tf.global_variables_initializer())\n",
    "\n",
    "       \n",
    "\n",
    "    # How do we select acion in DDPG? We select action by adding noise to the action space. We use\n",
    "    # Ornstein-Uhlenbeck random process for generating noise\n",
    "\n",
    "    def choose_action(self, s):\n",
    "        a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]\n",
    "        a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)\n",
    "        \n",
    "        return a\n",
    "    \n",
    "    \n",
    "    # then we define the function called learn where the actual training happens,\n",
    "    # here we select a minibatch of states, actions, rewards and next state from the experience buffer\n",
    "    # and we train actor and critic network\n",
    "\n",
    "    def learn(self):\n",
    "        \n",
    "        # soft target replacement\n",
    "        self.sess.run(self.soft_replace)\n",
    "\n",
    "        indices = np.random.choice(memory, size=batch_size)\n",
    "        batch_transition = self.memory[indices, :]\n",
    "        batch_states = batch_transition[:, :self.no_of_states]\n",
    "        batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]\n",
    "        batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]\n",
    "        batch_next_state = batch_transition[:, -self.no_of_states:]\n",
    "\n",
    "        self.sess.run(self.atrain, {self.state: batch_states})\n",
    "        self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards, self.next_state: batch_next_state})\n",
    "\n",
    "    # we define a function store_transition which stores all the transition information in the buffer\n",
    "    def store_transition(self, s, a, r, s_):\n",
    "        trans = np.hstack((s,a,[r],s_))\n",
    "        \n",
    "        index = self.pointer % memory\n",
    "        self.memory[index, :] = trans\n",
    "        self.pointer += 1\n",
    "\n",
    "        if self.pointer > memory:\n",
    "            self.noise_variance *= 0.99995\n",
    "            self.learn()\n",
    "            \n",
    "            \n",
    "     # we define the function build_actor_network for builing our actor network       \n",
    "    def build_actor_network(self, s, scope, trainable):\n",
    "        # Actor DPG\n",
    "        with tf.variable_scope(scope):\n",
    "            l1 = tf.layers.dense(s, 30, activation = tf.nn.tanh, name = 'l1', trainable = trainable)\n",
    "            a = tf.layers.dense(l1, self.no_of_actions, activation = tf.nn.tanh, name = 'a', trainable = trainable)     \n",
    "            return tf.multiply(a, self.a_bound, name = \"scaled_a\")  \n",
    "\n",
    "\n",
    "    # followed by we define the function build_crtic_network which build our critic network\n",
    "    def build_crtic_network(self, s, a, scope, trainable):\n",
    "        with tf.variable_scope(scope):\n",
    "            n_l1 = 30\n",
    "            w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable = trainable)\n",
    "            w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable = trainable)\n",
    "            b1 = tf.get_variable('b1', [1, n_l1], trainable = trainable)\n",
    "            net = tf.nn.tanh( tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1 )\n",
    "\n",
    "            q = tf.layers.dense(net, 1, trainable = trainable)\n",
    "            return q\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we see how to apply DDPG for swinging up the pendulum"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " First, we initialize our gym environment using make function    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2018-06-11 15:50:38,150] Making new env: Pendulum-v0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[1]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "env = gym.make(\"Pendulum-v0\")\n",
    "env = env.unwrapped\n",
    "env.seed(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Get the number od states and actions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "no_of_states = env.observation_space.shape[0]\n",
    "no_of_actions = env.action_space.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As it is a continous action space get the higher bound of the action"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "a_bound = env.action_space.high"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we create an object to our DDPG class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ddpg = DDPG(no_of_actions, no_of_states, a_bound)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# for storing the total rewards\n",
    "total_reward = []\n",
    "\n",
    "# set the number of episodes\n",
    "no_of_episodes = 300\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# for each episodes \n",
    "for i in range(no_of_episodes):\n",
    "    # initialize the environment\n",
    "    s = env.reset()\n",
    "    \n",
    "    # episodic reward\n",
    "    ep_reward = 0\n",
    "    \n",
    "    for j in range(epsiode_steps):\n",
    "        \n",
    "        env.render()\n",
    "\n",
    "        # select action by adding noise through OU process\n",
    "        a = ddpg.choose_action(s)\n",
    "        \n",
    "        # peform the action and move to the next state s\n",
    "        s_, r, done, info = env.step(a)\n",
    "        \n",
    "        # store the the transition to our experience buffer \n",
    "        # sample some minibatch of experience and train the network\n",
    "        ddpg.store_transition(s, a, r, s_)\n",
    "      \n",
    "        # update current state as next state\n",
    "        s = s_\n",
    "        \n",
    "        # add episodic rewards\n",
    "        ep_reward += r\n",
    "        \n",
    "        if j == epsiode_steps-1:\n",
    "            \n",
    "            # store the total rewards\n",
    "            total_reward.append(ep_reward)\n",
    "            \n",
    "            # print rewards obtained per each episode\n",
    "            print('Episode:', i, ' Reward: %i' % int(ep_reward))\n",
    "   \n",
    "            break\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Credits for the code used in this section  goes to <a href='https://github.com/wangshuailong/reinforcement_learning_with_Tensorflow/tree/master/DDPG'>wshuail </a>"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:universe]",
   "language": "python",
   "name": "conda-env-universe-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}