{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Lunar Lander Using Policy Gradients"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "\n",
    "\n",
    "Say our agent is driving the space vehicle and goal of our agent is to land correctly on the landing pad. If our agent(lander) lands away from the landing pad then it loses the reward and the episode will get terminated\n",
    "if the agent crashes or comes to rest. \n",
    "\n",
    "Four discrete actions available in the environment are do nothing, fire left orientation engine, fire main engine, fire right orientation engine.\n",
    "\n",
    "Now, we will see how to train our agents to correctly land on the landing pad with Policy Gradients\n",
    "\n",
    "Credits for the code: <a href='https://github.com/gabrielgarza/openai-gym-policy-gradient'> Gabriel </a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "First we import necessary libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "from tensorflow.python.framework import ops\n",
    "import gym"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Then we define the PolicyGradient class for implementing the policy gradient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class PolicyGradient:   \n",
    "    \n",
    "    # first we define the __init__ method where we initialize all variables\n",
    "    \n",
    "    def __init__(self, n_x,n_y,learning_rate=0.01, reward_decay=0.95):\n",
    "            \n",
    "        # number of states in the environemnt    \n",
    "        self.n_x = n_x \n",
    "        \n",
    "        # number of actions in the environment\n",
    "        self.n_y = n_y\n",
    "        \n",
    "        # learning rate of the network\n",
    "        self.lr = learning_rate\n",
    "        \n",
    "        # discount factor\n",
    "        self.gamma = reward_decay \n",
    "    \n",
    "        # initialize the lists for storing observations, actions and rewards\n",
    "        self.episode_observations, self.episode_actions, self.episode_rewards = [], [], []\n",
    "        \n",
    "        # we define a function called build_network for building the neural network\n",
    "        self.build_network()\n",
    "        \n",
    "        # stores the cost i.e loss\n",
    "        self.cost_history = []\n",
    "        \n",
    "        # initialize tensorflow session\n",
    "        self.sess = tf.Session()\n",
    "        self.sess.run(tf.global_variables_initializer())\n",
    "        \n",
    "\n",
    "    # next we define a function called store_transition which stores the transition information\n",
    "    # i.e state, action, reward and we can use this transitions for training the network\n",
    "\n",
    "    def store_transition(self, s, a, r):\n",
    "        \n",
    "        self.episode_observations.append(s)\n",
    "        self.episode_rewards.append(r)\n",
    "\n",
    "        # store actions as list of arrays\n",
    "        action = np.zeros(self.n_y)\n",
    "        action[a] = 1\n",
    "        self.episode_actions.append(action)\n",
    "        \n",
    "    # now, we define a function choose_action for choosing the action given the state,\n",
    "\n",
    "    def choose_action(self, observation):\n",
    "\n",
    "        # reshape observation to (num_features, 1)\n",
    "        observation = observation[:, np.newaxis]\n",
    "\n",
    "        # run forward propagation to get softmax probabilities\n",
    "        prob_weights = self.sess.run(self.outputs_softmax, feed_dict = {self.X: observation})\n",
    "\n",
    "        # select action using a biased sample this will return the index of the action we have sampled\n",
    "        action = np.random.choice(range(len(prob_weights.ravel())), p=prob_weights.ravel())\n",
    "        \n",
    "        return action\n",
    "\n",
    "    \n",
    "    # we define build_network for creating our neural network\n",
    "    \n",
    "    def build_network(self):\n",
    "        \n",
    "        # placeholders for input x, and output y\n",
    "        self.X = tf.placeholder(tf.float32, shape=(self.n_x, None), name=\"X\")\n",
    "        self.Y = tf.placeholder(tf.float32, shape=(self.n_y, None), name=\"Y\")\n",
    "        \n",
    "        # placeholder for reward\n",
    "        self.discounted_episode_rewards_norm = tf.placeholder(tf.float32, [None, ], name=\"actions_value\")\n",
    "\n",
    "        # we build 3 layer neural network with 2 hidden layers and 1 output layer\n",
    "        \n",
    "        # number of neurons in the hidden layer\n",
    "        units_layer_1 = 10\n",
    "        units_layer_2 = 10\n",
    "        \n",
    "        # number of neurons in the output layer\n",
    "        units_output_layer = self.n_y\n",
    "        \n",
    "        # now let us initialize weights and bias value using tensorflow's tf.contrib.layers.xavier_initializer\n",
    "        \n",
    "        W1 = tf.get_variable(\"W1\", [units_layer_1, self.n_x], initializer = tf.contrib.layers.xavier_initializer(seed=1))\n",
    "        b1 = tf.get_variable(\"b1\", [units_layer_1, 1], initializer = tf.contrib.layers.xavier_initializer(seed=1))\n",
    "        W2 = tf.get_variable(\"W2\", [units_layer_2, units_layer_1], initializer = tf.contrib.layers.xavier_initializer(seed=1))\n",
    "        b2 = tf.get_variable(\"b2\", [units_layer_2, 1], initializer = tf.contrib.layers.xavier_initializer(seed=1))\n",
    "        W3 = tf.get_variable(\"W3\", [self.n_y, units_layer_2], initializer = tf.contrib.layers.xavier_initializer(seed=1))\n",
    "        b3 = tf.get_variable(\"b3\", [self.n_y, 1], initializer = tf.contrib.layers.xavier_initializer(seed=1))\n",
    "\n",
    "        # and then, we perform forward propagation\n",
    "\n",
    "        Z1 = tf.add(tf.matmul(W1,self.X), b1)\n",
    "        A1 = tf.nn.relu(Z1)\n",
    "        Z2 = tf.add(tf.matmul(W2, A1), b2)\n",
    "        A2 = tf.nn.relu(Z2)\n",
    "        Z3 = tf.add(tf.matmul(W3, A2), b3)\n",
    "        A3 = tf.nn.softmax(Z3)\n",
    "\n",
    "\n",
    "        # as we require, probabilities, we apply softmax activation function in the output layer,\n",
    "        \n",
    "        logits = tf.transpose(Z3)\n",
    "        labels = tf.transpose(self.Y)\n",
    "        self.outputs_softmax = tf.nn.softmax(logits, name='A3')\n",
    "\n",
    "        # next we define our loss function as cross entropy loss\n",
    "        neg_log_prob = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)\n",
    "        \n",
    "        # reward guided loss\n",
    "        loss = tf.reduce_mean(neg_log_prob * self.discounted_episode_rewards_norm)  \n",
    "\n",
    "        # we use adam optimizer for minimizing the loss\n",
    "        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)\n",
    "\n",
    "\n",
    "    # define discount_and_norm_rewards function which will result the discounted and normalised reward\n",
    "    \n",
    "    def discount_and_norm_rewards(self):\n",
    "        discounted_episode_rewards = np.zeros_like(self.episode_rewards)\n",
    "        cumulative = 0\n",
    "        for t in reversed(range(len(self.episode_rewards))):\n",
    "            cumulative = cumulative * self.gamma + self.episode_rewards[t]\n",
    "            discounted_episode_rewards[t] = cumulative\n",
    "\n",
    "        discounted_episode_rewards -= np.mean(discounted_episode_rewards)\n",
    "        discounted_episode_rewards /= np.std(discounted_episode_rewards)\n",
    "        return discounted_episode_rewards\n",
    "    \n",
    "    # now we actually learn i.e train our network\n",
    "    \n",
    "    def learn(self):\n",
    "        # discount and normalize episodic reward\n",
    "        discounted_episode_rewards_norm = self.discount_and_norm_rewards()\n",
    "\n",
    "        # train the nework\n",
    "        self.sess.run(self.train_op, feed_dict={\n",
    "             self.X: np.vstack(self.episode_observations).T,\n",
    "             self.Y: np.vstack(np.array(self.episode_actions)).T,\n",
    "             self.discounted_episode_rewards_norm: discounted_episode_rewards_norm,\n",
    "        })\n",
    "\n",
    "        # reset the episodic data\n",
    "        self.episode_observations, self.episode_actions, self.episode_rewards  = [], [], []\n",
    "\n",
    "        return discounted_episode_rewards_norm\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Now let us initialize our gym environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2018-06-11 15:49:43,082] Making new env: LunarLander-v2\n"
     ]
    }
   ],
   "source": [
    "env = gym.make('LunarLander-v2')\n",
    "env = env.unwrapped"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Initialize variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RENDER_ENV = False\n",
    "EPISODES = 5000\n",
    "rewards = []\n",
    "RENDER_REWARD_MIN = 5000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Create an instance to our PolicyGradient class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "PG = PolicyGradient(\n",
    "    n_x = env.observation_space.shape[0],\n",
    "    n_y = env.action_space.n,\n",
    "    learning_rate=0.02,\n",
    "    reward_decay=0.99,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Start the action! Now!!!!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for episode in range(EPISODES):\n",
    "    \n",
    "    # get the state\n",
    "    observation = env.reset()\n",
    "    episode_reward = 0\n",
    "\n",
    "\n",
    "    while True:\n",
    "        \n",
    "        if RENDER_ENV: env.render()\n",
    "\n",
    "        # choose an action based on the state\n",
    "        action = PG.choose_action(observation)\n",
    "\n",
    "        # perform action in the environment and move to next state and receive reward\n",
    "        observation_, reward, done, info = env.step(action)\n",
    "\n",
    "        # store the transition information\n",
    "        PG.store_transition(observation, action, reward)\n",
    "        \n",
    "        # sum the rewards obtained in each episode\n",
    "        episode_rewards_sum = sum(PG.episode_rewards)\n",
    "        \n",
    "        # if the reward is less than -259 then terminate the episode\n",
    "        if episode_rewards_sum < -250:\n",
    "            done = True\n",
    "    \n",
    "        if done:\n",
    "            episode_rewards_sum = sum(PG.episode_rewards)\n",
    "            rewards.append(episode_rewards_sum)\n",
    "            max_reward_so_far = np.amax(rewards)\n",
    "\n",
    "            print(\"Episode: \", episode)\n",
    "            print(\"Reward: \", episode_rewards_sum)\n",
    "            print(\"Max reward so far: \", max_reward_so_far)\n",
    "\n",
    "            # train the network\n",
    "            discounted_episode_rewards_norm = PG.learn()\n",
    "\n",
    "            if max_reward_so_far > RENDER_REWARD_MIN: RENDER_ENV = False\n",
    "\n",
    "\n",
    "            break\n",
    "\n",
    "        # update the next state as current state\n",
    "        observation = observation_\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:universe]",
   "language": "python",
   "name": "conda-env-universe-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}