# 405 DQN Reinforcement Learning

For more, visit my tutorial page: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou
More about Reinforcement learning: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/

Dependencies:
* torch: 0.1.11
* gym: 0.8.1
* numpy

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import gym

In [2]:
# ---- Training configuration ------------------------------------------------
BATCH_SIZE = 32               # number of transitions sampled per learning step
LR = 0.01                     # learning rate handed to the Adam optimizer
EPSILON = 0.9                 # probability of picking the greedy (argmax) action
GAMMA = 0.9                   # reward discount factor
TARGET_REPLACE_ITER = 100     # learning steps between target-network syncs
MEMORY_CAPACITY = 2000        # number of rows in the replay memory

# ---- Environment -----------------------------------------------------------
# The unwrapped environment object is used; the training loop reads
# x_threshold / theta_threshold_radians directly from it.
env = gym.make('CartPole-v0').unwrapped
N_ACTIONS, N_STATES = env.action_space.n, env.observation_space.shape[0]

[2017-06-20 22:23:40,418] Making new env: CartPole-v0


In [3]:
class Net(nn.Module):
    """Small fully connected Q-network: state -> hidden (ReLU) -> one value per action.

    Args:
        n_states: Size of the input state vector. ``None`` (default) uses the
            module-level ``N_STATES``.
        n_actions: Number of output values. ``None`` (default) uses the
            module-level ``N_ACTIONS``.
        n_hidden: Width of the hidden layer (default 10).
        init_std: Standard deviation of the zero-mean normal distribution used
            to initialise both weight matrices (default 0.1).

    Calling ``Net()`` with no arguments builds exactly the network the notebook
    used before these parameters existed.
    """

    def __init__(self, n_states=None, n_actions=None, n_hidden=10, init_std=0.1):
        super(Net, self).__init__()
        # Resolve the notebook globals lazily (not as default values) so the
        # class can be defined, and used with explicit sizes, without them.
        if n_states is None:
            n_states = N_STATES
        if n_actions is None:
            n_actions = N_ACTIONS

        self.fc1 = nn.Linear(n_states, n_hidden)
        self.fc1.weight.data.normal_(0, init_std)   # initialization (weights only)
        self.out = nn.Linear(n_hidden, n_actions)
        self.out.weight.data.normal_(0, init_std)   # initialization (weights only)

    def forward(self, x):
        """Return the action values for a batch of states ``x``."""
        x = self.fc1(x)
        x = F.relu(x)
        actions_value = self.out(x)
        return actions_value

In [4]:
class DQN(object):
    """Deep Q-Network agent with a ring-buffer replay memory and a target network.

    Two ``Net`` instances are held: ``eval_net`` is trained on every call to
    ``learn`` and ``target_net`` is overwritten with ``eval_net``'s weights
    every ``TARGET_REPLACE_ITER`` learning steps.

    Each replay-memory row is laid out as
    ``[s (N_STATES values), a, r, s_ (N_STATES values)]``.
    """

    def __init__(self):
        self.eval_net, self.target_net = Net(), Net()

        self.learn_step_counter = 0                                     # for target updating
        self.memory_counter = 0                                         # for storing memory
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))     # initialize memory
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def choose_action(self, x):
        """Pick an action for the single state ``x``.

        With probability ``EPSILON`` the argmax of ``eval_net``'s output is
        returned, otherwise a uniformly random action index.

        Returns:
            int: action index in ``[0, N_ACTIONS)``.
        """
        # input only one sample: add a leading batch dimension
        x = Variable(torch.unsqueeze(torch.FloatTensor(x), 0))
        if np.random.uniform() < EPSILON:   # greedy
            actions_value = self.eval_net.forward(x)
            # The argmax index tensor is (1, 1) when the reduced dim is kept
            # and (1,) when it is dropped; ravel() handles both layouts.
            action = int(torch.max(actions_value, 1)[1].data.numpy().ravel()[0])
        else:   # random
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_):
        """Write one transition into the replay memory, overwriting the oldest row once full."""
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one optimisation step on a random batch from the replay memory.

        Returns:
            float: the batch loss computed before the parameter update, or
            ``None`` when the replay memory is still empty (nothing is done).
        """
        # Only rows that have actually been written are eligible for sampling;
        # the rest of the buffer still holds its all-zero initial values.
        n_filled = min(self.memory_counter, MEMORY_CAPACITY)
        if n_filled == 0:
            return None

        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # sample batch transitions (with replacement)
        sample_index = np.random.choice(n_filled, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = Variable(torch.FloatTensor(b_memory[:, :N_STATES]))
        # explicit int64 so the LongTensor conversion does not depend on the platform's default int
        b_a = Variable(torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(np.int64)))
        b_r = Variable(torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2]))
        b_s_ = Variable(torch.FloatTensor(b_memory[:, -N_STATES:]))

        # q_eval w.r.t the action in experience
        q_eval = self.eval_net(b_s).gather(1, b_a)      # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()         # detach from graph, don't backpropagate
        # Force the max to (batch, 1). If the reduction drops the dim, adding a
        # (batch,) tensor to the (batch, 1) rewards would broadcast to
        # (batch, batch) and silently corrupt the target.
        q_target = b_r + GAMMA * q_next.max(1)[0].view(-1, 1)   # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ravel() copes with both a 0-d and a 1-element loss tensor
        return float(loss.data.numpy().ravel()[0])

In [5]:
# Build the agent: creates the eval/target networks, the replay memory and the optimizer.
dqn = DQN()

In [6]:

print('\nCollecting experience...')
for i_episode in range(400):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = dqn.choose_action(s)

        # advance the environment by one step with the chosen action
        s_, r, done, info = env.step(a)

        # Discard the environment's reward and use a shaped one instead:
        # r1 shrinks as |x| grows, r2 shrinks as |theta| grows.
        x, x_dot, theta, theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2

        dqn.store_transition(s, a, r, s_)
        ep_r += r

        # Learning (and episode reporting) only starts once more than
        # MEMORY_CAPACITY transitions have been stored.
        memory_full = dqn.memory_counter > MEMORY_CAPACITY
        if memory_full:
            dqn.learn()

        if done:
            if memory_full:
                print('Ep: ', i_episode,
                      '| Ep_r: ', round(ep_r, 2))
            break
        s = s_


Collecting experience...
Ep: 201 | Ep_r: 1.59
Ep: 202 | Ep_r: 4.18
Ep: 203 | Ep_r: 2.73
Ep: 204 | Ep_r: 1.97
Ep: 205 | Ep_r: 1.18
Ep: 206 | Ep_r: 0.86
Ep: 207 | Ep_r: 2.88
Ep: 208 | Ep_r: 1.63
Ep: 209 | Ep_r: 3.91
Ep: 210 | Ep_r: 3.6
Ep: 211 | Ep_r: 0.98
Ep: 212 | Ep_r: 3.85
Ep: 213 | Ep_r: 1.81
Ep: 214 | Ep_r: 2.32
Ep: 215 | Ep_r: 3.75
Ep: 216 | Ep_r: 3.53
Ep: 217 | Ep_r: 4.75
Ep: 218 | Ep_r: 2.4
Ep: 219 | Ep_r: 0.64
Ep: 220 | Ep_r: 1.15
Ep: 221 | Ep_r: 2.3
Ep: 222 | Ep_r: 7.37
Ep: 223 | Ep_r: 1.25
Ep: 224 | Ep_r: 5.02
Ep: 225 | Ep_r: 10.29
Ep: 226 | Ep_r: 17.54
Ep: 227 | Ep_r: 36.2
Ep: 228 | Ep_r: 6.61
Ep: 229 | Ep_r: 10.04
Ep: 230 | Ep_r: 55.19
Ep: 231 | Ep_r: 10.03
Ep: 232 | Ep_r: 13.25
Ep: 233 | Ep_r: 8.75
Ep: 234 | Ep_r: 3.83
Ep: 235 | Ep_r: -0.92
Ep: 236 | Ep_r: 5.12
Ep: 237 | Ep_r: 3.56
Ep: 238 | Ep_r: 5.69
Ep: 239 | Ep_r: 8.43
Ep: 240 | Ep_r: 29.27
Ep: 241 | Ep_r: 17.95
Ep: 242 | Ep_r: 44.77
Ep: 243 | Ep_r: 98.0
Ep: 244 | Ep_r: 38.78
Ep: 245 | Ep_r: 45.02
Ep: 246 | Ep_r: 27.7