{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Drive up the Mountain Using A3C"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Let us understand A3C by MountainCar example. Our agent is the car and it is placed\n",
    "between two mountains. The goal of our agent is to drive up the mountain on the right.\n",
    "Although, the car can't drive up the mountain on its first pass it has to drive up back and forth\n",
    "to build the momentum. A high reward will be assigned if our agent spends less energy on\n",
    "driving up."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    " Import required libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gym\n",
    "import multiprocessing\n",
    "import threading\n",
    "import numpy as np\n",
    "import os\n",
    "import shutil\n",
    "import matplotlib.pyplot as plt\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Initialize necessary variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# number of worker agents\n",
    "no_of_workers = multiprocessing.cpu_count() \n",
    "\n",
    "# maximum number of steps per episode\n",
    "no_of_ep_steps = 200 \n",
    "\n",
    "# total number of episodes\n",
    "no_of_episodes = 2000 \n",
    "\n",
    "global_net_scope = 'Global_Net'\n",
    "\n",
    "# sets how often the global network should be updated\n",
    "update_global = 10\n",
    "\n",
    "# discount factor\n",
    "gamma = 0.90 \n",
    "\n",
    "# entropy factor\n",
    "entropy_beta = 0.01 \n",
    "\n",
    "# learning rate for actor\n",
    "lr_a = 0.0001 \n",
    "\n",
    "# learning rate for critic\n",
    "lr_c = 0.001 \n",
    "\n",
    "# boolean for rendering the environment\n",
    "render=False \n",
    "\n",
    "# directory for storing logs\n",
    "log_dir = 'logs'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we initialize our Mountain car environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2018-06-11 16:04:50,197] Making new env: MountainCarContinuous-v0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([-0.42727224,  0.        ])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "env = gym.make('MountainCarContinuous-v0')\n",
    "env.reset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# we get the number of states, actions and also the action bound\n",
    "no_of_states = env.observation_space.shape[0]\n",
    "no_of_actions = env.action_space.shape[0]\n",
    "action_bound = [env.action_space.low, env.action_space.high]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "\n",
    "\n",
    "We will define our Actor critic network in a class called ActorCritic. Check the comments added on each line of code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class ActorCritic(object):\n",
    "     def __init__(self, scope, sess, globalAC=None):\n",
    "         \n",
    "        # first we initialize the session and RMS prop optimizer for both\n",
    "        # our actor and critic networks\n",
    "        \n",
    "        self.sess=sess\n",
    "        \n",
    "        self.actor_optimizer = tf.train.RMSPropOptimizer(lr_a, name='RMSPropA')\n",
    "        self.critic_optimizer = tf.train.RMSPropOptimizer(lr_c, name='RMSPropC')\n",
    " \n",
    "        # now, if our network is global then,\n",
    "    \n",
    "        if scope == global_net_scope:\n",
    "            with tf.variable_scope(scope):\n",
    "                \n",
    "                # initialize states and build actor and critic network\n",
    "                self.s = tf.placeholder(tf.float32, [None, no_of_states], 'S')\n",
    "                \n",
    "                # get the parameters of actor and critic networks\n",
    "                self.a_params, self.c_params = self._build_net(scope)[-2:]\n",
    "                \n",
    "        # if our network is local then,\n",
    "        else:\n",
    "            with tf.variable_scope(scope):\n",
    "                \n",
    "                # initialize state, action and also target value as v_target\n",
    "                \n",
    "                self.s = tf.placeholder(tf.float32, [None, no_of_states], 'S')\n",
    "                self.a_his = tf.placeholder(tf.float32, [None, no_of_actions], 'A')\n",
    "                self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')\n",
    "                \n",
    "                # since we are in continuous actions space, we will calculate\n",
    "                # mean and variance for choosing action\n",
    "                \n",
    "                mean, var, self.v, self.a_params, self.c_params = self._build_net(scope)\n",
    "\n",
    "                # then we calculate td error as the difference between v_target - v\n",
    "                td = tf.subtract(self.v_target, self.v, name='TD_error')\n",
    "\n",
    "                # minimize the TD error\n",
    "                with tf.name_scope('critic_loss'):\n",
    "                    self.critic_loss = tf.reduce_mean(tf.square(td))\n",
    "\n",
    "                # update the mean and var value by multiplying mean with the action bound and adding var with 1e-4\n",
    "\n",
    "                with tf.name_scope('wrap_action'):\n",
    "                    mean, var = mean * action_bound[1], var + 1e-4\n",
    "                                            \n",
    "                # we can generate distribution using this updated mean and var\n",
    "                normal_dist = tf.contrib.distributions.Normal(mean, var)\n",
    "    \n",
    "                # now we shall calculate the actor loss. Recall the loss function.\n",
    "                with tf.name_scope('actor_loss'):\n",
    "                    \n",
    "                    # calculate first term of loss which is log(pi(s))\n",
    "                    log_prob = normal_dist.log_prob(self.a_his)\n",
    "                    exp_v = log_prob * td\n",
    "                    \n",
    "                    # calculate entropy from our action distribution for ensuring exploration\n",
    "                    entropy = normal_dist.entropy()\n",
    "                    \n",
    "                    # we can define our final loss as,\n",
    "                    self.exp_v = exp_v + entropy_beta * entropy\n",
    "                    \n",
    "                    # then, we try to minimize the loss\n",
    "                    self.actor_loss = tf.reduce_mean(-self.exp_v)\n",
    "                    \n",
    "                 # now, we choose action by drawing from the distribution and clipping it between action bounds,\n",
    "            \n",
    "                with tf.name_scope('choose_action'):\n",
    "                    self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), action_bound[0], action_bound[1])\n",
    "     \n",
    "                # calculate gradients for both of our actor and critic networks,\n",
    "        \n",
    "                with tf.name_scope('local_grad'):\n",
    "\n",
    "                    self.a_grads = tf.gradients(self.actor_loss, self.a_params)\n",
    "                    self.c_grads = tf.gradients(self.critic_loss, self.c_params)\n",
    " \n",
    "            # now, we update our global network weights,\n",
    "            with tf.name_scope('sync'):\n",
    "                \n",
    "                # pull the global network weights to the local networks\n",
    "                with tf.name_scope('pull'):\n",
    "                    self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]\n",
    "                    self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]\n",
    "                \n",
    "                # push the local gradients to the global network\n",
    "                with tf.name_scope('push'):\n",
    "                    self.update_a_op = self.actor_optimizer.apply_gradients(zip(self.a_grads, globalAC.a_params))\n",
    "                    self.update_c_op = self.critic_optimizer.apply_gradients(zip(self.c_grads, globalAC.c_params))\n",
    "                    \n",
    "        \n",
    "\n",
    "\n",
    "     # next, we define a function called _build_net for building our actor and critic network\n",
    "    \n",
    "     def _build_net(self, scope):\n",
    "     # initialize weights\n",
    "        w_init = tf.random_normal_initializer(0., .1)\n",
    "        \n",
    "        with tf.variable_scope('actor'):\n",
    "            l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')\n",
    "            mean = tf.layers.dense(l_a, no_of_actions, tf.nn.tanh,kernel_initializer=w_init, name='mean')\n",
    "            var = tf.layers.dense(l_a, no_of_actions, tf.nn.softplus, kernel_initializer=w_init, name='var')\n",
    "            \n",
    "        with tf.variable_scope('critic'):\n",
    "            l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')\n",
    "            v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v')\n",
    "        \n",
    "        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')\n",
    "        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')\n",
    "        \n",
    "        return mean, var, v, a_params, c_params\n",
    "    \n",
    "         \n",
    "     # update the local gradients to the global network\n",
    "     def update_global(self, feed_dict):\n",
    "        self.sess.run([self.update_a_op, self.update_c_op], feed_dict)\n",
    "     \n",
    "     # get the global parameters to the local networks\n",
    "     def pull_global(self):\n",
    "        self.sess.run([self.pull_a_params_op, self.pull_c_params_op])\n",
    "     \n",
    "     # select action\n",
    "     def choose_action(self, s):\n",
    "        s = s[np.newaxis, :]\n",
    "        return self.sess.run(self.A, {self.s: s})[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    " Now we will define worker class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class Worker(object):\n",
    "    def __init__(self, name, globalAC, sess):\n",
    "        # intialize environment for each worker\n",
    "        self.env = gym.make('MountainCarContinuous-v0').unwrapped\n",
    "        self.name = name\n",
    "        \n",
    "        # create ActorCritic agent for each worker\n",
    "        self.AC = ActorCritic(name, sess, globalAC)\n",
    "        self.sess=sess\n",
    "        \n",
    "    def work(self):\n",
    "        global global_rewards, global_episodes\n",
    "        total_step = 1\n",
    " \n",
    "        # store state, action, reward\n",
    "        buffer_s, buffer_a, buffer_r = [], [], []\n",
    "        \n",
    "        # loop if the coordinator is active and global episode is less than the maximum episode\n",
    "        while not coord.should_stop() and global_episodes < no_of_episodes:\n",
    "            \n",
    "            # initialize the environment by resetting\n",
    "            s = self.env.reset()\n",
    "            \n",
    "            # store the episodic reward\n",
    "            ep_r = 0\n",
    "            for ep_t in range(no_of_ep_steps):\n",
    "    \n",
    "                # Render the environment for only worker 1\n",
    "                if self.name == 'W_0' and render:\n",
    "                    self.env.render()\n",
    "                    \n",
    "                # choose the action based on the policy\n",
    "                a = self.AC.choose_action(s)\n",
    "\n",
    "                # perform the action (a), recieve reward (r) and move to the next state (s_)\n",
    "                s_, r, done, info = self.env.step(a)\n",
    "             \n",
    "                # set done as true if we reached maximum step per episode\n",
    "                done = True if ep_t == no_of_ep_steps - 1 else False\n",
    "                \n",
    "                ep_r += r\n",
    "                \n",
    "                # store the state, action and rewards in the buffer\n",
    "                buffer_s.append(s)\n",
    "                buffer_a.append(a)\n",
    "                \n",
    "                # normalize the reward\n",
    "                buffer_r.append((r+8)/8)\n",
    "    \n",
    "    \n",
    "                # we Update the global network after particular time step\n",
    "                if total_step % update_global == 0 or done:\n",
    "                    if done:\n",
    "                        v_s_ = 0\n",
    "                    else:\n",
    "                        v_s_ = self.sess.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]\n",
    "                    \n",
    "                    # buffer for target v\n",
    "                    buffer_v_target = []\n",
    "                    \n",
    "                    for r in buffer_r[::-1]:\n",
    "                        v_s_ = r + gamma * v_s_\n",
    "                        buffer_v_target.append(v_s_)\n",
    "                        \n",
    "                    buffer_v_target.reverse()\n",
    "                    \n",
    "                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)\n",
    "                    feed_dict = {\n",
    "                                 self.AC.s: buffer_s,\n",
    "                                 self.AC.a_his: buffer_a,\n",
    "                                 self.AC.v_target: buffer_v_target,\n",
    "                                 }\n",
    "                    \n",
    "                    # update global network\n",
    "                    self.AC.update_global(feed_dict)\n",
    "                    buffer_s, buffer_a, buffer_r = [], [], []\n",
    "                    \n",
    "                    # get global parameters to local ActorCritic\n",
    "                    self.AC.pull_global()\n",
    "                    \n",
    "                s = s_\n",
    "                total_step += 1\n",
    "                if done:\n",
    "                    if len(global_rewards) < 5:\n",
    "                        global_rewards.append(ep_r)\n",
    "                    else:\n",
    "                        global_rewards.append(ep_r)\n",
    "                        global_rewards[-1] =(np.mean(global_rewards[-5:]))\n",
    "                    \n",
    "                    global_episodes += 1\n",
    "                    break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Now let us start the tensorflow session and run the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create a list for string global rewards and episodes\n",
    "global_rewards = []\n",
    "global_episodes = 0\n",
    "\n",
    "# start tensorflow session\n",
    "sess = tf.Session()\n",
    "\n",
    "with tf.device(\"/cpu:0\"):\n",
    "    \n",
    "# create an instance to our ActorCritic Class\n",
    "    global_ac = ActorCritic(global_net_scope,sess)\n",
    "    \n",
    "    workers = []\n",
    "    \n",
    "    # loop for each workers\n",
    "    for i in range(no_of_workers):\n",
    "        i_name = 'W_%i' % i\n",
    "        workers.append(Worker(i_name, global_ac,sess))\n",
    "\n",
    "coord = tf.train.Coordinator()\n",
    "sess.run(tf.global_variables_initializer())\n",
    "\n",
    "# log everything so that we can visualize the graph in tensorboard\n",
    "\n",
    "if os.path.exists(log_dir):\n",
    "    shutil.rmtree(log_dir)\n",
    "\n",
    "tf.summary.FileWriter(log_dir, sess.graph)\n",
    "\n",
    "worker_threads = []\n",
    "\n",
    "#start workers\n",
    "\n",
    "for worker in workers:\n",
    "\n",
    "    job = lambda: worker.work()\n",
    "    t = threading.Thread(target=job)\n",
    "    t.start()\n",
    "    worker_threads.append(t)\n",
    "coord.join(worker_threads)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Credits for the code used in this section goes to <a href='https://github.com/stefanbo92/A3C-Continuous'> Stefan Boschenriedter</a>"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:anaconda]",
   "language": "python",
   "name": "conda-env-anaconda-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}