import numpy as np

from qcatch import display_screen


class ExperienceReplay(object):
    """
    During gameplay all the experiences <s, a, r, s'> are stored in a replay memory.
    In training, batches of randomly drawn experiences are used to generate the
    input and target for training.
    """

    def __init__(self, max_memory=100, discount=.9):
        """
        Setup
        max_memory: the maximum number of experiences we want to store
        memory: a list of experiences
        discount: the discount factor for future experience

        In the memory, whether the game ended at the state is stored separately
        in a nested array
        [...
         [experience, game_over]
         [experience, game_over]
         ...]
        """
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):
        # Save an experience to memory
        self.memory.append([states, game_over])
        # We don't want to store infinite memories, so if we have too many,
        # we just delete the oldest one
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=10):
        # How many experiences do we have?
        len_memory = len(self.memory)

        # Calculate the number of actions that can possibly be taken in the game
        num_actions = model.output_shape[-1]

        # Dimensions of the game field
        env_dim = self.memory[0][0][0].shape[1]

        # We want to return an input and target array with inputs from an observed state...
        inputs = np.zeros((min(len_memory, batch_size), env_dim))

        # ...and the target r + gamma * max Q(s', a').
        # Note that our target is a matrix, with one column per possible action,
        # not only for the action taken. The actions not taken keep the model's
        # own prediction as their target so that training does not affect them.
        targets = np.zeros((inputs.shape[0], num_actions))

        # We draw states to learn from randomly
        for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])):
            """
            Here we load one transition <s, a, r, s'> from memory
            state_t: initial state s
            action_t: action taken a
            reward_t: reward earned r
            state_tp1: the state that followed s'
            """
            state_t, action_t, reward_t, state_tp1 = self.memory[idx][0]

            # We also need to know whether the game ended at this state
            game_over = self.memory[idx][1]

            # add the state s to the input
            inputs[i:i+1] = state_t

            # First we fill the target values with the predictions of the model.
            # They will not be affected by training (since the training loss for them is 0)
            targets[i] = model.predict(state_t)[0]

            """
            If the game ended, the expected reward Q(s, a) should be the final reward r.
            Otherwise the target value is r + gamma * max Q(s', a')
            """
            # Here Q_sa is max_a' Q(s', a')
            Q_sa = np.max(model.predict(state_tp1)[0])

            # if the game ended, the target is the final reward
            if game_over:  # if game_over is True
                targets[i, action_t] = reward_t
            else:
                # r + gamma * max Q(s', a')
                targets[i, action_t] = reward_t + self.discount * Q_sa
        return inputs, targets
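
# A concrete, illustrative walk-through of the target construction in get_batch
# (the numbers below are made up for the example, they do not come from the game):
# suppose num_actions = 3, the stored transition took action_t = 2 with reward_t = 0,
# the game did not end, the model currently predicts Q(s, .) = [0.1, 0.5, 0.2],
# max_a' Q(s', a') = 0.8 and the discount is 0.9.
# The target row starts as the model's own prediction [0.1, 0.5, 0.2] and only the
# entry for the action actually taken is replaced by the Bellman target
# 0 + 0.9 * 0.8 = 0.72, giving [0.1, 0.5, 0.72]. The untouched entries produce
# zero loss, so training only moves Q(s, a) for the action that was actually played.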

epsilon = .1  # exploration


def train(model, epochs, env, max_memory, batch_size, verbose=1, visualize=False):
    exp_replay = ExperienceReplay(max_memory=max_memory)

    # Train
    # Resetting the win counter
    win_cnt = 0
    # We want to keep track of the progress of the AI over time,
    # so we save its win count history
    win_hist = []

    # Epochs is the number of games we play
    for e in range(epochs):
        loss = 0.
        # Resetting the game
        env.reset()
        game_over = False
        # get initial input
        input_t = env.observe()

        while not game_over:
            # The learner is acting on the last observed game screen
            # input_t is a vector representing the game screen
            input_tm1 = input_t

            if np.random.rand() <= epsilon:
                # Select a random move (exploration)
                action = np.random.randint(0, 3, size=1)
            else:
                # Select the best move (exploitation)
                # q contains the expected rewards for the actions
                q = model.predict(input_tm1)
                # we pick the action with the highest expected reward
                action = np.argmax(q[0])

            # apply action, get rewards and new state
            input_t, reward, game_over = env.act(action)

            # If we managed to catch the fruit we add 1 to our win counter
            if reward == 1:
                win_cnt += 1

            if visualize:
                display_screen(action, reward, input_t)

            """
            The experiences <s, a, r, s'> we make during gameplay are our training data.
            Here we first save the last experience, and then load a batch of experiences
            to train our model
            """
            # store experience
            exp_replay.remember([input_tm1, action, reward, input_t], game_over)

            # Load batch of experiences
            inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)

            # train model on experiences
            batch_loss = model.train_on_batch(inputs, targets)
            # print(loss)
            loss += batch_loss

        if verbose > 0:
            print("Epoch {:03d}/{:03d} | Loss {:.4f} | Win count {}".format(e, epochs, loss, win_cnt))
        win_hist.append(win_cnt)
    return model


def test(model, env, visualize=True):
    # This function lets a pretrained model play the game to evaluate how well it is doing
    global last_frame_time
    # plt.ion()

    # c is a simple counter keeping track of the number of frames played
    c = 0
    # Reset the last frame time (we are starting from 0)
    last_frame_time = 0
    # Reset score
    points = 0

    # For evaluation we play the game 10 times
    for e in range(10):
        # Reset the game
        env.reset()
        # The game is not over
        game_over = False
        # get initial input
        input_t = env.observe()
        # display_screen(3, points, input_t)
        c += 1
        while not game_over:
            # The learner is acting on the last observed game screen
            # input_t is a vector representing the game screen
            input_tm1 = input_t
            # Feed the learner the current status and get the expected rewards
            # for the different actions from it
            q = model.predict(input_tm1)
            # Select the action with the highest expected reward
            action = np.argmax(q[0])
            # apply action, get rewards and new state
            input_t, reward, game_over = env.act(action)
            # Update our score
            points += reward
            if visualize:
                display_screen(action, points, input_t)
            c += 1
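

# --- Usage sketch ---
# A minimal, illustrative example of how the pieces above could be wired together.
# The Catch environment itself is not defined in this file: `CatchEnv` below is a
# hypothetical stand-in for any object that exposes reset(), observe() and
# act(action) the way train() and test() use them. The network layout and the
# hyperparameter values are illustrative assumptions, not taken from this file.
if __name__ == "__main__":
    from keras.models import Sequential
    from keras.layers import Dense

    grid_size = 10       # assumed width/height of the game field
    num_actions = 3      # move left, stay, move right
    hidden_size = 100

    # A simple fully connected network mapping the flattened screen to one
    # Q-value per action, trained with mean squared error against the targets
    # produced by ExperienceReplay.get_batch
    model = Sequential()
    model.add(Dense(hidden_size, input_shape=(grid_size ** 2,), activation="relu"))
    model.add(Dense(hidden_size, activation="relu"))
    model.add(Dense(num_actions))
    model.compile(optimizer="sgd", loss="mse")

    # Hypothetical environment construction; replace with the actual Catch
    # environment used alongside this module
    env = CatchEnv(grid_size)

    model = train(model, epochs=500, env=env, max_memory=500, batch_size=50, visualize=False)
    test(model, env, visualize=True)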