{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Doom Game Using DRQN" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " First let us import all necessary libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import math\n", "from vizdoom import *\n", "import timeit\n", "import math\n", "import os\n", "import sys" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "Now, let us define the function get_input_shape to compute the final shape of the input image,\n", "once after it gets convolved as a result of convolutional layer.\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def get_input_shape(Image,Filter,Stride):\n", " layer1 = math.ceil(((Image - Filter + 1) / Stride))\n", " \n", " o1 = math.ceil((layer1 / Stride))\n", " \n", " layer2 = math.ceil(((o1 - Filter + 1) / Stride))\n", " \n", " o2 = math.ceil((layer2 / Stride))\n", " \n", " layer3 = math.ceil(((o2 - Filter + 1) / Stride))\n", " \n", " o3 = math.ceil((layer3 / Stride))\n", "\n", " return int(o3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Now we define the class DRQN and implement our Deep Recurrent Q Network" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "class DRQN():\n", " def __init__(self, input_shape, num_actions, inital_learning_rate):\n", " \n", " # first, we initialize all the hyperparameters\n", "\n", " self.tfcast_type = tf.float32\n", " \n", " # shape of our input which would be (length, width, channels)\n", " self.input_shape = input_shape \n", " \n", " # number of actions in the environment\n", " self.num_actions = num_actions\n", " \n", " # learning rate for the neural network\n", " self.learning_rate = inital_learning_rate\n", " \n", " # now we will define the hyperparameters of the convolutional neural network \n", "\n", " # filter size\n", " self.filter_size = 5\n", " \n", " # number of filters\n", " self.num_filters = [16, 32, 64]\n", " \n", " # stride size\n", " self.stride = 2\n", " \n", " # pool size\n", " self.poolsize = 2 \n", " \n", " # shape of our convolutional layer\n", " self.convolution_shape = get_input_shape(input_shape[0], self.filter_size, self.stride) * get_input_shape(input_shape[1], self.filter_size, self.stride) * self.num_filters[2]\n", " \n", " # now we define the hyperparameters of our recurrent neural network and the final feed forward layer\n", " \n", " # number of neurons \n", " self.cell_size = 100\n", " \n", " # number of hidden layers\n", " self.hidden_layer = 50\n", " \n", " # drop out probability\n", " self.dropout_probability = [0.3, 0.2]\n", "\n", " # hyperparameters for optimization\n", " self.loss_decay_rate = 0.96\n", " self.loss_decay_steps = 180\n", "\n", " \n", " # initialize all the variables for the CNN\n", "\n", " # we initialize the placeholder for input whose shape would be (length, width, channel)\n", " self.input = tf.placeholder(shape = (self.input_shape[0], self.input_shape[1], self.input_shape[2]), dtype = self.tfcast_type)\n", " \n", " # we will also initialize the shape of the target vector whose shape is equal to the number of actions\n", " self.target_vector = tf.placeholder(shape = (self.num_actions, 1), dtype = self.tfcast_type)\n", "\n", " # initialize feature maps for our corresponding 3 filters\n", " self.features1 = 
"        self.features1 = tf.Variable(initial_value = np.random.rand(self.filter_size, self.filter_size, input_shape[2], self.num_filters[0]),\n", "            dtype = self.tfcast_type)\n", "        \n", "        self.features2 = tf.Variable(initial_value = np.random.rand(self.filter_size, self.filter_size, self.num_filters[0], self.num_filters[1]),\n", "            dtype = self.tfcast_type)\n", "        \n", "        self.features3 = tf.Variable(initial_value = np.random.rand(self.filter_size, self.filter_size, self.num_filters[1], self.num_filters[2]),\n", "            dtype = self.tfcast_type)\n", "\n", "        # initialize variables for the RNN\n", "        # recall how an RNN works from chapter 7\n", "        \n", "        # hidden state of the RNN\n", "        self.h = tf.Variable(initial_value = np.zeros((1, self.cell_size)), dtype = self.tfcast_type)\n", "        \n", "        # input (convolution output) to hidden weight matrix\n", "        self.rW = tf.Variable(initial_value = np.random.uniform(\n", "            low = -np.sqrt(6. / (self.convolution_shape + self.cell_size)),\n", "            high = np.sqrt(6. / (self.convolution_shape + self.cell_size)),\n", "            size = (self.convolution_shape, self.cell_size)),\n", "            dtype = self.tfcast_type)\n", "        \n", "        # hidden to hidden weight matrix\n", "        self.rU = tf.Variable(initial_value = np.random.uniform(\n", "            low = -np.sqrt(6. / (2 * self.cell_size)),\n", "            high = np.sqrt(6. / (2 * self.cell_size)),\n", "            size = (self.cell_size, self.cell_size)),\n", "            dtype = self.tfcast_type)\n", "        \n", "        # hidden to output weight matrix\n", "        self.rV = tf.Variable(initial_value = np.random.uniform(\n", "            low = -np.sqrt(6. / (2 * self.cell_size)),\n", "            high = np.sqrt(6. / (2 * self.cell_size)),\n", "            size = (self.cell_size, self.cell_size)),\n", "            dtype = self.tfcast_type)\n", "        # biases\n", "        self.rb = tf.Variable(initial_value = np.zeros(self.cell_size), dtype = self.tfcast_type)\n", "        self.rc = tf.Variable(initial_value = np.zeros(self.cell_size), dtype = self.tfcast_type)\n", "\n", "        \n", "        # initialize the weights and bias of the feed forward network\n", "        \n", "        # weights\n",
"        self.fW = tf.Variable(initial_value = np.random.uniform(\n", "            low = -np.sqrt(6. / (self.cell_size + self.num_actions)),\n", "            high = np.sqrt(6. / (self.cell_size + self.num_actions)),\n", "            size = (self.cell_size, self.num_actions)),\n", "            dtype = self.tfcast_type)\n", "        \n", "        # bias\n", "        self.fb = tf.Variable(initial_value = np.zeros(self.num_actions), dtype = self.tfcast_type)\n", "\n", "        # exponentially decaying learning rate\n", "        self.step_count = tf.Variable(initial_value = 0, dtype = self.tfcast_type)\n", "        self.learning_rate = tf.train.exponential_decay(self.learning_rate,\n", "            self.step_count,\n", "            self.loss_decay_steps,\n", "            self.loss_decay_rate,\n", "            staircase = False)\n", "        \n", "        \n", "        # now let us build the network\n", "\n", "        # first convolutional layer\n", "        self.conv1 = tf.nn.conv2d(input = tf.reshape(self.input, shape = (1, self.input_shape[0], self.input_shape[1], self.input_shape[2])), filter = self.features1, strides = [1, self.stride, self.stride, 1], padding = \"VALID\")\n", "        self.relu1 = tf.nn.relu(self.conv1)\n", "        self.pool1 = tf.nn.max_pool(self.relu1, ksize = [1, self.poolsize, self.poolsize, 1], strides = [1, self.stride, self.stride, 1], padding = \"SAME\")\n", "\n", "        # second convolutional layer\n", "        self.conv2 = tf.nn.conv2d(input = self.pool1, filter = self.features2, strides = [1, self.stride, self.stride, 1], padding = \"VALID\")\n", "        self.relu2 = tf.nn.relu(self.conv2)\n", "        self.pool2 = tf.nn.max_pool(self.relu2, ksize = [1, self.poolsize, self.poolsize, 1], strides = [1, self.stride, self.stride, 1], padding = \"SAME\")\n", "\n", "        # third convolutional layer\n", "        self.conv3 = tf.nn.conv2d(input = self.pool2, filter = self.features3, strides = [1, self.stride, self.stride, 1], padding = \"VALID\")\n", "        self.relu3 = tf.nn.relu(self.conv3)\n", "        self.pool3 = tf.nn.max_pool(self.relu3, ksize = [1, self.poolsize, self.poolsize, 1], strides = [1, self.stride, self.stride, 1], padding = \"SAME\")\n", "\n", "        # add dropout and reshape the input\n", "        self.drop1 = tf.nn.dropout(self.pool3, self.dropout_probability[0])\n", "        self.reshaped_input = tf.reshape(self.drop1, shape = [1, -1])\n", "\n", "\n", "        # now we build the recurrent neural network, which takes the input from the last layer of the convolutional network\n", "        self.h = tf.tanh(tf.matmul(self.reshaped_input, self.rW) + tf.matmul(self.h, self.rU) + self.rb)\n", "        self.o = tf.nn.softmax(tf.matmul(self.h, self.rV) + self.rc)\n", "\n", "        # add dropout to the RNN output\n", "        self.drop2 = tf.nn.dropout(self.o, self.dropout_probability[1])\n", "        \n", "        # we feed the result of the RNN to the feed forward layer\n", "        self.output = tf.reshape(tf.matmul(self.drop2, self.fW) + self.fb, shape = [-1, 1])\n", "        self.prediction = tf.argmax(self.output)\n", "\n", "        # compute the loss\n", "        self.loss = tf.reduce_mean(tf.square(self.target_vector - self.output))\n", "        \n", "        # we use the Adam optimizer for minimizing the error\n", "        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)\n", "        \n", "        # compute the gradients of the loss and apply them\n", "        self.gradients = self.optimizer.compute_gradients(self.loss)\n", "        self.update = self.optimizer.apply_gradients(self.gradients)\n", "\n", "        self.parameters = (self.features1, self.features2, self.features3,\n", "            self.rW, self.rU, self.rV, self.rb, self.rc,\n", "            self.fW, self.fb)\n", "        " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we define the class ExperienceReplay to implement the experience replay buffer.\n", "\n", "We store all of the agent's experience, i.e. (state, action, reward) tuples, in the experience replay buffer,\n", "and we sample minibatches of this experience for training the network." ] },
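{ "cell_type": "markdown", "metadata": {}, "source": [ "Before the full class below, here is a small standalone sketch of the queue-like behaviour we want from the buffer: once it is full, the oldest transition is dropped to make room for the newest one. It uses a plain Python deque purely for illustration and is not used anywhere else in this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import deque\n", "\n", "# a toy fixed-size buffer, just to illustrate the idea (not part of the agent)\n", "toy_buffer = deque(maxlen = 3)\n", "\n", "# append four dummy (state, action, reward) tuples into a buffer of capacity 3\n", "for transition in [('s0', 0, 0.0), ('s1', 2, 1.0), ('s2', 1, 0.0), ('s3', 4, 5.0)]:\n", "    toy_buffer.append(transition)\n", "\n", "# the oldest transition ('s0', ...) has been dropped once the capacity was exceeded\n", "print(list(toy_buffer))" ] },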
"code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "class ExperienceReplay():\n", " def __init__(self, buffer_size):\n", " \n", " # buffer for holding the transistion \n", " self.buffer = [] \n", " \n", " # size of the buffer\n", " self.buffer_size = buffer_size\n", " \n", " # we remove the old transistion if buffer size has reached it's limit. Think off the buffer as a queue when new\n", " # one comes, old one goes off\n", " \n", " def appendToBuffer(self, memory_tuplet):\n", " if len(self.buffer) > self.buffer_size: \n", " for i in range(len(self.buffer) - self.buffer_size):\n", " self.buffer.remove(self.buffer[0]) \n", " self.buffer.append(memory_tuplet) \n", " \n", " \n", " # define a function called sample for sampling some random n number of transistions \n", " \n", " def sample(self, n):\n", " memories = []\n", " \n", " for i in range(n):\n", " memory_index = np.random.randint(0, len(self.buffer)) \n", " memories.append(self.buffer[memory_index])\n", " return memories" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " Now finally let us get to the training part. Yayyyyy!!!" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def train(num_episodes, episode_length, learning_rate, scenario = \"deathmatch.cfg\", map_path = 'map02', render = False):\n", " \n", " # discount parameter for Q-value computation\n", " discount_factor = .99\n", " \n", " # frequency for updating the experience in the buffer\n", " update_frequency = 5\n", " store_frequency = 50\n", " \n", " # for printing the output\n", " print_frequency = 1000\n", "\n", " # initialize variables for storing total rewards and total loss\n", " total_reward = 0\n", " total_loss = 0\n", " old_q_value = 0\n", "\n", " # initialize lists for storing the episodic rewards and losses \n", " rewards = []\n", " losses = []\n", "\n", " # okay, now let us get to the action!\n", " \n", " # first, we initialize our doomgame environment\n", " game = DoomGame()\n", " \n", " # specify the path where our scenario file is located\n", " game.set_doom_scenario_path(scenario)\n", " \n", " # specify the path of map file\n", " game.set_doom_map(map_path)\n", "\n", " # then we set screen resolution and screen format\n", " game.set_screen_resolution(ScreenResolution.RES_256X160) \n", " game.set_screen_format(ScreenFormat.RGB24)\n", "\n", " # we can add particles and effetcs we needed by simply setting them to true or false\n", " game.set_render_hud(False)\n", " game.set_render_minimal_hud(False)\n", " game.set_render_crosshair(False)\n", " game.set_render_weapon(True)\n", " game.set_render_decals(False)\n", " game.set_render_particles(False)\n", " game.set_render_effects_sprites(False)\n", " game.set_render_messages(False)\n", " game.set_render_corpses(False)\n", " game.set_render_screen_flashes(True)\n", "\n", " # now we will specify buttons that should be available to the agent\n", " game.add_available_button(Button.MOVE_LEFT)\n", " game.add_available_button(Button.MOVE_RIGHT)\n", " game.add_available_button(Button.TURN_LEFT)\n", " game.add_available_button(Button.TURN_RIGHT)\n", " game.add_available_button(Button.MOVE_FORWARD)\n", " game.add_available_button(Button.MOVE_BACKWARD)\n", " game.add_available_button(Button.ATTACK)\n", " \n", " \n", " # okay,now we will add one more button called delta. The above button will only work \n", " # like a keyboard keys and will have only boolean values. 
\n", "\n", " # so we use delta button which emulates a mouse device which will have positive and negative values\n", " # and it will be useful in environment for exploring\n", " \n", " game.add_available_button(Button.TURN_LEFT_RIGHT_DELTA, 90)\n", " game.add_available_button(Button.LOOK_UP_DOWN_DELTA, 90)\n", "\n", " # initialize an array for actions\n", " actions = np.zeros((game.get_available_buttons_size(), game.get_available_buttons_size()))\n", " count = 0\n", " for i in actions:\n", " i[count] = 1\n", " count += 1\n", " actions = actions.astype(int).tolist()\n", "\n", "\n", " # then we add the game variables, ammo, health, and killcount\n", " game.add_available_game_variable(GameVariable.AMMO0)\n", " game.add_available_game_variable(GameVariable.HEALTH)\n", " game.add_available_game_variable(GameVariable.KILLCOUNT)\n", "\n", " # we set episode_timeout to terminate the episode after some time step\n", " # we also set episode_start_time which is useful for skipping intial events\n", " \n", " game.set_episode_timeout(6 * episode_length)\n", " game.set_episode_start_time(10)\n", " game.set_window_visible(render)\n", " \n", " # we can also enable sound by setting set_sound_enable to true\n", " game.set_sound_enabled(False)\n", "\n", " # we set living reward to 0 which the agent for each move it does even though the move is not useful\n", " game.set_living_reward(0)\n", "\n", " # doom has different modes such as player, spectator, asynchronous player and asynchronous spectator\n", " \n", " # in spectator mode humans will play and agent will learn from it.\n", " # in player mode, agent actually plays the game, so we use player mode.\n", " \n", " game.set_mode(Mode.PLAYER)\n", "\n", " # okay, So now we, initialize the game environment\n", " game.init()\n", "\n", " # now, let us create instance to our DRQN class and create our both actor and target DRQN networks\n", " actionDRQN = DRQN((160, 256, 3), game.get_available_buttons_size() - 2, learning_rate)\n", " targetDRQN = DRQN((160, 256, 3), game.get_available_buttons_size() - 2, learning_rate)\n", " \n", " # we will also create instance to the ExperienceReplay class with the buffer size of 1000\n", " experiences = ExperienceReplay(1000)\n", "\n", " # for storing the models\n", " saver = tf.train.Saver({v.name: v for v in actionDRQN.parameters}, max_to_keep = 1)\n", "\n", " \n", " # now let us start the training process\n", " # we initialize variables for sampling and storing transistions from the experience buffer\n", " sample = 5\n", " store = 50\n", " \n", " # start the tensorflow session\n", " with tf.Session() as sess:\n", " \n", " # initialize all tensorflow variables\n", " \n", " sess.run(tf.global_variables_initializer())\n", " \n", " for episode in range(num_episodes):\n", " \n", " # start the new episode\n", " game.new_episode()\n", " \n", " # play the episode till it reaches the episode length\n", " for frame in range(episode_length):\n", " \n", " # get the game state\n", " state = game.get_state()\n", " s = state.screen_buffer\n", " \n", " # select the action\n", " a = actionDRQN.prediction.eval(feed_dict = {actionDRQN.input: s})[0]\n", " action = actions[a]\n", " \n", " # perform the action and store the reward\n", " reward = game.make_action(action)\n", " \n", " # update total rewad\n", " total_reward += reward\n", "\n", " \n", " # if the episode is over then break\n", " if game.is_episode_finished():\n", " break\n", " \n", " # store transistion to our experience buffer\n", " if (frame % store) == 0:\n", " 
"                    experiences.appendToBuffer((s, action, reward))\n", "\n", "                # sample experience from the experience buffer\n", "                if (frame % sample) == 0:\n", "                    memory = experiences.sample(1)\n", "                    mem_frame = memory[0][0]\n", "                    mem_reward = memory[0][2]\n", "                    \n", "                    \n", "                    # now, train the network\n", "                    Q1 = actionDRQN.output.eval(feed_dict = {actionDRQN.input: mem_frame})\n", "                    Q2 = targetDRQN.output.eval(feed_dict = {targetDRQN.input: mem_frame})\n", "\n", "                    # get the current (decayed) learning rate\n", "                    learning_rate = actionDRQN.learning_rate.eval()\n", "\n", "                    # calculate the target Q value\n", "                    Qtarget = old_q_value + learning_rate * (mem_reward + discount_factor * Q2 - old_q_value)\n", "                    \n", "                    # update the old Q value\n", "                    old_q_value = Qtarget\n", "\n", "                    # compute the loss\n", "                    loss = actionDRQN.loss.eval(feed_dict = {actionDRQN.target_vector: Qtarget, actionDRQN.input: mem_frame})\n", "                    \n", "                    # update the total loss\n", "                    total_loss += loss\n", "\n", "                    # update both networks\n", "                    actionDRQN.update.run(feed_dict = {actionDRQN.target_vector: Qtarget, actionDRQN.input: mem_frame})\n", "                    targetDRQN.update.run(feed_dict = {targetDRQN.target_vector: Qtarget, targetDRQN.input: mem_frame})\n", "\n", "            rewards.append((episode, total_reward))\n", "            losses.append((episode, total_loss))\n", "\n", "            \n", "            print(\"Episode %d - Reward = %.3f, Loss = %.3f.\" % (episode, total_reward, total_loss))\n", "\n", "\n", "            total_reward = 0\n", "            total_loss = 0\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Episode 0 - Reward = 0.000, Loss = 0.234.\n", "Episode 1 - Reward = 0.000, Loss = 0.800.\n" ] } ], "source": [ "train(num_episodes = 10000, episode_length = 300, learning_rate = 0.01, render = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Credits for the code used in this section go to Luthanicus." ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:universe]", "language": "python", "name": "conda-env-universe-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.4" } }, "nbformat": 4, "nbformat_minor": 2 }