import random

import numpy as np

from qmaze import Qmaze, show, completion_check


class Experience(object):
    def __init__(self, model, max_memory=100, discount=0.95):
        self.model = model
        self.max_memory = max_memory
        self.discount = discount
        self.memory = list()
        self.num_actions = model.output_shape[-1]

    def remember(self, episode):
        # episode = [envstate, action, reward, envstate_next, game_over]
        # memory[i] = episode
        # envstate == flattened 1d maze cells info, including rat cell (see method: observe)
        self.memory.append(episode)
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def predict(self, envstate):
        return self.model.predict(envstate)[0]

    def get_data(self, data_size=10):
        env_size = self.memory[0][0].shape[1]   # envstate 1d size (1st element of episode)
        mem_size = len(self.memory)
        data_size = min(mem_size, data_size)
        inputs = np.zeros((data_size, env_size))
        targets = np.zeros((data_size, self.num_actions))
        for i, j in enumerate(np.random.choice(range(mem_size), data_size, replace=False)):
            envstate, action, reward, envstate_next, game_over = self.memory[j]
            inputs[i] = envstate
            # There should be no target values for actions not taken.
            targets[i] = self.predict(envstate)
            # Q_sa = derived policy = max quality env/action = max_a' Q(s', a')
            Q_sa = np.max(self.predict(envstate_next))
            if game_over:
                targets[i, action] = reward
            else:
                # reward + gamma * max_a' Q(s', a')
                targets[i, action] = reward + self.discount * Q_sa
        return inputs, targets


# Exploration factor
epsilon = 0.1


def qtrain(model, maze, **opt):
    global epsilon
    n_epoch = opt.get('n_epoch', 15000)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    weights_file = opt.get('weights_file', "")
    name = opt.get('name', 'model')
    visualize = opt.get('visualize', False)

    # If you want to continue training from a previous model,
    # just supply the h5 file name to the weights_file option
    if weights_file:
        print("loading weights from file: %s" % (weights_file,))
        model.load_weights(weights_file)

    # Construct environment/game from numpy array: maze (see above)
    qmaze = Qmaze(maze)

    # Initialize experience replay object
    experience = Experience(model, max_memory=max_memory)

    win_history = []              # history of win/lose games
    hsize = qmaze.maze.size // 2  # history window size
    win_rate = 0.0

    for epoch in range(n_epoch):
        rat_cell = random.choice(qmaze.free_cells)  # or (0, 0)
        qmaze.reset(rat_cell)
        game_over = False

        # get initial envstate (1d flattened canvas)
        envstate = qmaze.observe()

        n_episodes = 0
        while not game_over:
            valid_actions = qmaze.valid_actions()
            if not valid_actions:
                break
            prev_envstate = envstate

            # Get next action
            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                action = np.argmax(experience.predict(prev_envstate))

            # Apply action, get reward and new envstate
            envstate, reward, game_status = qmaze.act(action)
            if visualize:
                show(qmaze)

            if game_status == 'win':
                win_history.append(1)
                game_over = True
            elif game_status == 'lose':
                win_history.append(0)
                game_over = True
            else:
                game_over = False

            # Store episode (experience)
            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)
            n_episodes += 1

            # Train neural network model
            inputs, targets = experience.get_data(data_size=data_size)
            model.fit(inputs, targets, epochs=8, batch_size=16, verbose=0)
            loss = model.evaluate(inputs, targets, verbose=0)

        # Print stats
        print("Epoch %d/%d | Loss: %.2f | Episodes: %d | Win count: %d"
              % (epoch + 1, n_epoch, loss, n_episodes, sum(win_history)))

        if len(win_history) > hsize:
            win_rate = sum(win_history[-hsize:]) / hsize

        # We simply check whether training has exhausted all free cells and
        # whether the agent won in all of those cases.
        if win_rate > 0.9:
            epsilon = 0.05
        if sum(win_history[-hsize:]) == hsize and completion_check(model, qmaze):
            print("Reached 100%% win rate at epoch: %d" % (epoch,))
            break

    return model
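

# Usage sketch (not part of the training code above): qtrain() expects a Keras
# model whose input size equals the flattened maze (maze.size) and whose output
# size equals the number of actions. The build_model() helper below is
# hypothetical, the four-action default mirrors the maze environment's move set,
# and the maze layout is only an illustration (it assumes Qmaze accepts a float
# array of 1.0 = free cell / 0.0 = wall with the target in the bottom-right
# corner). Assumes TensorFlow/Keras is installed.

from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


def build_model(maze, num_actions=4):
    # Small dense network: flattened maze in, one Q-value per action out.
    model = Sequential([
        Input(shape=(maze.size,)),
        Dense(maze.size, activation='relu'),
        Dense(maze.size, activation='relu'),
        Dense(num_actions),   # linear outputs = estimated Q-values
    ])
    model.compile(optimizer='adam', loss='mse')
    return model


if __name__ == '__main__':
    maze = np.array([
        [1., 0., 1., 1.],
        [1., 1., 1., 0.],
        [0., 0., 1., 1.],
        [1., 1., 1., 1.],
    ])
    model = build_model(maze)
    qtrain(model, maze, n_epoch=1000, max_memory=8 * maze.size, data_size=32)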