{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Swinging Up the Pendulum Using DDPG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Goal:\n", "\n", "\n", "Say, we have a pendulum which starts in a random position and the goal of our agent is to swing the pendulum up so that it stays upright." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, let us import necessary libraries\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import gym" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Now let us define the hyperparameters." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# number of steps in each episode\n", "epsiode_steps = 500 \n", "\n", "# learning rate for actor\n", "lr_a = 0.001 \n", "\n", "# learning rate for critic\n", "lr_c = 0.002 \n", "\n", "# discount factor\n", "gamma = 0.9 \n", "\n", "# soft replacement\n", "alpha = 0.01 \n", "\n", "# replay buffer size\n", "memory = 10000 \n", "\n", "# batch size for training\n", "batch_size = 32 \n", "render = False" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Next, we define the class DDPG which implements the DDPG algorithm. " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "class DDPG(object):\n", " def __init__(self, no_of_actions, no_of_states, a_bound,):\n", " \n", " # initialize the memory with shape as no of actions, no of states and our defined memory size\n", " self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)\n", " \n", " # initialize pointer to point to our experience buffer\n", " self.pointer = 0\n", " \n", " # initialize tensorflow session\n", " self.sess = tf.Session()\n", " \n", " # initialize the variance for OU process for exploring policies\n", " self.noise_variance = 3.0\n", " \n", " self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,\n", " \n", " # placeholder for current state, next state and rewards\n", " self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')\n", " self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')\n", " self.reward = tf.placeholder(tf.float32, [None, 1], 'r')\n", " \n", " # build the actor network which has separate eval(primary) and target network\n", " with tf.variable_scope('Actor'):\n", " self.a = self.build_actor_network(self.state, scope='eval', trainable=True)\n", " a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)\n", " \n", " # build the critic network which has separate eval(primary) and target network \n", " with tf.variable_scope('Critic'):\n", " q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)\n", " q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)\n", " \n", "\n", " # initialize the network parameters\n", " self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')\n", " self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')\n", " \n", " self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')\n", " self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')\n", "\n", " # update target value\n", " self.soft_replace = [[tf.assign(at, (1-alpha)*at+alpha*ae), tf.assign(ct, (1-alpha)*ct+alpha*ce)]\n", " for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]\n", " \n", " \n", " # compute target Q value, we know that Q(s,a) = reward + gamma * Q'(s',a')\n", " q_target = self.reward + gamma * q_\n", " \n", " \n", " # compute TD error i.e actual - predicted values\n", " td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)\n", " \n", " # train the critic network with adam optimizer\n", " self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name=\"adam-ink\", var_list = self.ce_params)\n", " \n", " # compute the loss in actor network\n", " a_loss = - tf.reduce_mean(q) \n", " \n", " # train the actor network with adam optimizer for minimizing the loss\n", " self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)\n", "\n", " # initialize summary writer to visualize our network in tensorboard\n", " tf.summary.FileWriter(\"logs\", self.sess.graph)\n", " \n", " # initialize all variables\n", " self.sess.run(tf.global_variables_initializer())\n", "\n", " \n", "\n", " # How do we select acion in DDPG? We select action by adding noise to the action space. We use\n", " # Ornstein-Uhlenbeck random process for generating noise\n", "\n", " def choose_action(self, s):\n", " a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]\n", " a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)\n", " \n", " return a\n", " \n", " \n", " # then we define the function called learn where the actual training happens,\n", " # here we select a minibatch of states, actions, rewards and next state from the experience buffer\n", " # and we train actor and critic network\n", "\n", " def learn(self):\n", " \n", " # soft target replacement\n", " self.sess.run(self.soft_replace)\n", "\n", " indices = np.random.choice(memory, size=batch_size)\n", " batch_transition = self.memory[indices, :]\n", " batch_states = batch_transition[:, :self.no_of_states]\n", " batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]\n", " batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]\n", " batch_next_state = batch_transition[:, -self.no_of_states:]\n", "\n", " self.sess.run(self.atrain, {self.state: batch_states})\n", " self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards, self.next_state: batch_next_state})\n", "\n", " # we define a function store_transition which stores all the transition information in the buffer\n", " def store_transition(self, s, a, r, s_):\n", " trans = np.hstack((s,a,[r],s_))\n", " \n", " index = self.pointer % memory\n", " self.memory[index, :] = trans\n", " self.pointer += 1\n", "\n", " if self.pointer > memory:\n", " self.noise_variance *= 0.99995\n", " self.learn()\n", " \n", " \n", " # we define the function build_actor_network for builing our actor network \n", " def build_actor_network(self, s, scope, trainable):\n", " # Actor DPG\n", " with tf.variable_scope(scope):\n", " l1 = tf.layers.dense(s, 30, activation = tf.nn.tanh, name = 'l1', trainable = trainable)\n", " a = tf.layers.dense(l1, self.no_of_actions, activation = tf.nn.tanh, name = 'a', trainable = trainable) \n", " return tf.multiply(a, self.a_bound, name = \"scaled_a\") \n", "\n", "\n", " # followed by we define the function build_crtic_network which build our critic network\n", " def build_crtic_network(self, s, a, scope, trainable):\n", " with tf.variable_scope(scope):\n", " n_l1 = 30\n", " w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable = trainable)\n", " w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable = trainable)\n", " b1 = tf.get_variable('b1', [1, n_l1], trainable = trainable)\n", " net = tf.nn.tanh( tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1 )\n", "\n", " q = tf.layers.dense(net, 1, trainable = trainable)\n", " return q\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we see how to apply DDPG for swinging up the pendulum" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " First, we initialize our gym environment using make function " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2018-06-11 15:50:38,150] Making new env: Pendulum-v0\n" ] }, { "data": { "text/plain": [ "[1]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "env = gym.make(\"Pendulum-v0\")\n", "env = env.unwrapped\n", "env.seed(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Get the number od states and actions" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "no_of_states = env.observation_space.shape[0]\n", "no_of_actions = env.action_space.shape[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As it is a continous action space get the higher bound of the action" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "a_bound = env.action_space.high" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we create an object to our DDPG class" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "ddpg = DDPG(no_of_actions, no_of_states, a_bound)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# for storing the total rewards\n", "total_reward = []\n", "\n", "# set the number of episodes\n", "no_of_episodes = 300\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "# for each episodes \n", "for i in range(no_of_episodes):\n", " # initialize the environment\n", " s = env.reset()\n", " \n", " # episodic reward\n", " ep_reward = 0\n", " \n", " for j in range(epsiode_steps):\n", " \n", " env.render()\n", "\n", " # select action by adding noise through OU process\n", " a = ddpg.choose_action(s)\n", " \n", " # peform the action and move to the next state s\n", " s_, r, done, info = env.step(a)\n", " \n", " # store the the transition to our experience buffer \n", " # sample some minibatch of experience and train the network\n", " ddpg.store_transition(s, a, r, s_)\n", " \n", " # update current state as next state\n", " s = s_\n", " \n", " # add episodic rewards\n", " ep_reward += r\n", " \n", " if j == epsiode_steps-1:\n", " \n", " # store the total rewards\n", " total_reward.append(ep_reward)\n", " \n", " # print rewards obtained per each episode\n", " print('Episode:', i, ' Reward: %i' % int(ep_reward))\n", " \n", " break\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Credits for the code used in this section goes to wshuail " ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:universe]", "language": "python", "name": "conda-env-universe-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.4" } }, "nbformat": 4, "nbformat_minor": 2 }