{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Training the network"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "\n",
    "Now we will see how to train the network,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class DQN(object):\n",
    "\n",
    "    # First we define the class called DQN and initialize all varaiables in __init__ method\n",
    "\n",
    "    def __init__(self, state_size,\n",
    "                       action_size,\n",
    "                       session,\n",
    "                       summary_writer = None,\n",
    "                       exploration_period = 1000,\n",
    "                       minibatch_size = 32,\n",
    "                       discount_factor = 0.99,\n",
    "                       experience_replay_buffer = 10000,\n",
    "                       target_qnet_update_frequency = 10000,\n",
    "                       initial_exploration_epsilon = 1.0,\n",
    "                       final_exploration_epsilon = 0.05,\n",
    "                       reward_clipping = -1,\n",
    "                        ):\n",
    "\n",
    "       \n",
    "        self.state_size = state_size\n",
    "        self.action_size = action_size\n",
    "\n",
    "\n",
    "        self.session = session\n",
    "        self.exploration_period = float(exploration_period)\n",
    "        self.minibatch_size = minibatch_size\n",
    "        self.discount_factor = tf.constant(discount_factor)\n",
    "        self.experience_replay_buffer = experience_replay_buffer\n",
    "        self.summary_writer = summary_writer\n",
    "        self.reward_clipping = reward_clipping\n",
    "\n",
    "\n",
    "        self.target_qnet_update_frequency = target_qnet_update_frequency\n",
    "        self.initial_exploration_epsilon = initial_exploration_epsilon\n",
    "        self.final_exploration_epsilon = final_exploration_epsilon\n",
    "        self.num_training_steps = 0\n",
    "\n",
    "\n",
    "        # initialize primary DDQN by creating an instance to our QNetworkDueling class\n",
    "        self.qnet = QNetworkDueling(self.state_size, self.action_size, \"qnet\")\n",
    "\n",
    "        # similarly initialize the Target DDQN\n",
    "        self.target_qnet = QNetworkDueling(self.state_size, self.action_size, \"target_qnet\")\n",
    "\n",
    "        # Next initialize the optimizer as a RMSPropOptimizer\n",
    "        self.qnet_optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.99, epsilon=0.01) \n",
    "\n",
    "        # Now, initialize experience replay buffer by creating instance to our ReplayMemoryFast class\n",
    "        self.experience_replay = ReplayMemoryFast(self.experience_replay_buffer, self.minibatch_size)\n",
    "\n",
    "        # Setup the computation graph\n",
    "        self.create_graph()\n",
    "\n",
    "\n",
    "    # Next we define the function called copy_to_target_network for copying weights from the\n",
    "    # primary network to our target network\n",
    "\n",
    "    def copy_to_target_network(source_network, target_network):\n",
    "        target_network_update = []\n",
    "        for v_source, v_target in zip(source_network.variables(), target_network.variables()):\n",
    "            # this is equivalent to target = source\n",
    "            update_op = v_target.assign(v_source)\n",
    "            target_network_update.append(update_op)\n",
    "        return tf.group(*target_network_update)\n",
    "\n",
    "    \n",
    "    # Now we define the function called create graph and build our computation graph\n",
    "    def create_graph(self):\n",
    "\n",
    "        # we calculate Q values and select the action that has maximum Q value\n",
    "        with tf.name_scope(\"pick_action\"):\n",
    "            \n",
    "            # placeholder for state\n",
    "            self.state = tf.placeholder(tf.float32, (None,)+self.state_size , name=\"state\")\n",
    "\n",
    "            # placeholder for q values\n",
    "            self.q_values = tf.identity(self.qnet(self.state) , name=\"q_values\")\n",
    "\n",
    "            # placeholder for predicted actions\n",
    "            self.predicted_actions = tf.argmax(self.q_values, dimension=1 , name=\"predicted_actions\")\n",
    "\n",
    "            # plot as a historgram to track max q values\n",
    "            tf.histogram_summary(\"Q values\", tf.reduce_mean(tf.reduce_max(self.q_values, 1))) # save max q-values to track learning\n",
    "\n",
    "\n",
    "       \n",
    "       # Next we calculate target future reward\n",
    "        with tf.name_scope(\"estimating_future_rewards\"):\n",
    "          \n",
    "            self.next_state = tf.placeholder(tf.float32, (None,)+self.state_size , name=\"next_state\")\n",
    "            self.next_state_mask = tf.placeholder(tf.float32, (None,) , name=\"next_state_mask\") # 0 for terminal states\n",
    "            self.rewards = tf.placeholder(tf.float32, (None,) , name=\"rewards\")\n",
    "\n",
    "            self.next_q_values_targetqnet = tf.stop_gradient(self.target_qnet(self.next_state), name=\"next_q_values_targetqnet\")\n",
    "\n",
    "    \n",
    "            self.next_q_values_qnet = tf.stop_gradient(self.qnet(self.next_state), name=\"next_q_values_qnet\")\n",
    "            self.next_selected_actions = tf.argmax(self.next_q_values_qnet, dimension=1)\n",
    "            self.next_selected_actions_onehot = tf.one_hot(indices=self.next_selected_actions, depth=self.action_size)\n",
    "\n",
    "            self.next_max_q_values = tf.stop_gradient( tf.reduce_sum( tf.mul( self.next_q_values_targetqnet, self.next_selected_actions_onehot ) , reduction_indices=[1,] ) * self.next_state_mask )\n",
    "\n",
    "\n",
    "            self.target_q_values = self.rewards + self.discount_factor*self.next_max_q_values\n",
    "\n",
    "\n",
    "\n",
    "        # perform the optimization\n",
    "        with tf.name_scope(\"optimization_step\"):\n",
    "            self.action_mask = tf.placeholder(tf.float32, (None, self.action_size) , name=\"action_mask\") \n",
    "            self.y = tf.reduce_sum( self.q_values * self.action_mask , reduction_indices=[1,])\n",
    "\n",
    "            # clip the errors\n",
    "            self.error = tf.abs(self.y - self.target_q_values)\n",
    "            quadratic_part = tf.clip_by_value(self.error, 0.0, 1.0)\n",
    "            linear_part = self.error - quadratic_part\n",
    "            self.loss = tf.reduce_mean( 0.5*tf.square(quadratic_part) + linear_part )\n",
    "\n",
    "            # optimize the gradients\n",
    "            qnet_gradients = self.qnet_optimizer.compute_gradients(self.loss, self.qnet.variables())\n",
    "\n",
    "            for i, (grad, var) in enumerate(qnet_gradients):\n",
    "                if grad is not None:\n",
    "                    qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)\n",
    "\n",
    "            self.qnet_optimize = self.qnet_optimizer.apply_gradients(qnet_gradients)\n",
    "\n",
    "        # Copy the primary network weights to the target network\n",
    "        with tf.name_scope(\"target_network_update\"):\n",
    "            self.hard_copy_to_target = DQN.copy_to_target_network(self.qnet, self.target_qnet)\n",
    "\n",
    "\n",
    "\n",
    "    # We define the function called store for storing all the experience in the experience replay buffer\n",
    "\n",
    "    def store(self, state, action, reward, next_state, is_terminal):\n",
    "        # rewards clipping\n",
    "        if self.reward_clipping > 0.0:\n",
    "            reward = np.clip(reward, -self.reward_clipping, self.reward_clipping)\n",
    "\n",
    "        self.experience_replay.store(state, action, reward, next_state, is_terminal)\n",
    "\n",
    "\n",
    "    # We define a function called action for selecting actions using decaying epsilon greedy policy\n",
    "    \n",
    "    def action(self, state, training = False):\n",
    "     \n",
    "        if self.num_training_steps > self.exploration_period:\n",
    "            epsilon = self.final_exploration_epsilon\n",
    "        else:\n",
    "            epsilon =  self.initial_exploration_epsilon - float(self.num_training_steps) * (self.initial_exploration_epsilon - self.final_exploration_epsilon) / self.exploration_period\n",
    "\n",
    "        if not training:\n",
    "            epsilon = 0.05\n",
    "\n",
    "        # execute a random action with probability epsilon, or follow the QNet policy with probability 1-epsilon.\n",
    "        if random.random() <= epsilon:\n",
    "            action = random.randint(0, self.action_size-1)\n",
    "        else:\n",
    "            action = self.session.run(self.predicted_actions, {self.state:[state] } )[0]\n",
    "\n",
    "        return action\n",
    "\n",
    "\n",
    "    # Now we define a function called train for training our network\n",
    "\n",
    "    def train(self):\n",
    "        # Copy the QNetwork weights to the Target QNetwork.\n",
    "        if self.num_training_steps == 0:\n",
    "            print \"Training starts...\"\n",
    "            self.qnet.copy_to(self.target_qnet)\n",
    "\n",
    "\n",
    "        # Sample experience from replay memory\n",
    "        minibatch = self.experience_replay.sample()\n",
    "        if len(minibatch)==0:\n",
    "            return\n",
    "\n",
    "\n",
    "        # get the states, actions, rewards and next states from the minibatch\n",
    "        batch_states = np.asarray( [d[0] for d in minibatch] )\n",
    "        actions = [d[1] for d in minibatch]\n",
    "        batch_actions = np.zeros( (self.minibatch_size, self.action_size) )\n",
    "        for i in xrange(self.minibatch_size):\n",
    "            batch_actions[i, actions[i]] = 1\n",
    "\n",
    "        batch_rewards = np.asarray( [d[2] for d in minibatch] )\n",
    "        batch_newstates = np.asarray( [d[3] for d in minibatch] )\n",
    "\n",
    "        batch_newstates_mask = np.asarray( [not d[4] for d in minibatch] )\n",
    "\n",
    "\n",
    "        # Perform the training operation\n",
    "        scores, _, = self.session.run([self.q_values, self.qnet_optimize],\n",
    "                                      { self.state: batch_states,\n",
    "                                        self.next_state: batch_newstates,\n",
    "                                        self.next_state_mask: batch_newstates_mask,\n",
    "                                        self.rewards: batch_rewards,\n",
    "                                        self.action_mask: batch_actions} )\n",
    "\n",
    "\n",
    "        if self.num_training_steps % self.target_qnet_update_frequency == 0:\n",
    "\n",
    "            self.session.run( self.hard_copy_to_target )\n",
    "\n",
    "\n",
    "            # Write logs\n",
    "            print 'mean maxQ in minibatch: ',np.mean(np.max(scores,1))\n",
    "\n",
    "            str_ = self.session.run(self.summarize, { self.state: batch_states,\n",
    "                                        self.next_state: batch_newstates,\n",
    "                                        self.next_state_mask: batch_newstates_mask,\n",
    "                                        self.rewards: batch_rewards,\n",
    "                                        self.action_mask: batch_actions})\n",
    "\n",
    "            self.summary_writer.add_summary(str_, self.num_training_steps)\n",
    "\n",
    "\n",
    "        self.num_training_steps += 1\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:anaconda]",
   "language": "python",
   "name": "conda-env-anaconda-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}