""" An agent that learns to balance the gym's cartpole by: - encoding its state in a not too sparse small SDR - using a pair of ValueMap-s to map danger scores for left-right actions. The algorithm is a form of simple Q-Learning: - A pair of SDR value maps are used to read the fear of moving LEFT or RIGHT - At every step agent queries the two fear maps and picks the least scary move or moves randomly if both choices are similarily dangerous - Hooray (and lived happily ever after) happens when episode reaches maximum number of steps. - Death happens when episode ends prematurely - Only after death fear maps are updated with nightmares - It ignores gym's reward and penalizes the past N steps leading to death with an incremental penalty - such the closer to death the higher the penalty - For example: if step[-N+5] was LEFT, then it adds 5 with SDR encoding state[-N+5] to the LEFT fear map - So fear maps are incremented only by near death experiences, the closer to death is an experience, a higher a fear is added. Results: With the right parameters(*) it learns to survive in under 50 deaths. (*) sdr encoding should not be too large nor too sparse. Each state parameter uses 7 bits out of 50 (14% sparsity) Comments: The algorithm is similar to Q-Tables with the peculiarity that a SDR ValueMap highlights significant correlations between different properties within dataset. e.g - the association of being far right AND having high speed toward right will get a high fear score regardless of what About complaining the algorithm uses a hand crafted reward instead of the one received from the cartpole's environment, Sure, but it also means: - reward policy is really important on how well&fast an agent learns - after all, no real-life RL agent will be deployed by simply training it with a sloppy policy - local, graded rewards might be more useful than either very large sparse ones or tiny constant ones. - a trully intelligent agent would learn how to pick its own local rewards - The policies invented by nature - like fear - might be useful. Copyright & Disclaimer: Use the code as you wish, Do not blame me for hurting your pole """ import numpy as np from time import time import gym, random, numba from sdr_value_map import ValueCorrMap # The magic ingredient SDR_SIZE = 100 SDR_BITS = 4 # Number of ON bits for each state parameter STATE_SIZE = 4 # How many parameters are within a state LEFT,RIGHT = 0,1 NUM_EPISODES = 1000 EMPTY_SDR = np.array([], dtype = np.uint32) @numba.jit def sdr_encoder(state, minims, maxims): sdr_starts = ( np.arange(STATE_SIZE) * SDR_SIZE // STATE_SIZE).astype(np.uint32) sdr_widths = np.array([sdr_starts[1]-SDR_BITS] * 4, dtype = np.uint32) sdr_bits = np.array([SDR_BITS]*STATE_SIZE, dtype = np.uint32) sdr = [] ranges = (maxims - minims) values = state - minims # print(ranges, values) for i, val in enumerate(state): rang, val = ranges[i], values[i] width = int(sdr_widths[i] * val / rang) for k in range(sdr_bits[i]): sdr.append(width + k + sdr_starts[i]) return np.array(sdr, dtype=np.uint32) @numba.jit def min_max_adjust(state, minims, maxims): where = np.where(state < minims) minims[where] = state[where] where = np.where(state > maxims) maxims[where] = state[where] class SDR_Proxy_Player: # This is a wrapper over an actual player. # its purpose is to convert the float representation to SDR # (SDR encoder). 


class SDR_Proxy_Player:
    # This is a wrapper over an actual player.
    # Its purpose is to convert the float state representation to an SDR
    # (SDR encoder).
    # Its peculiarity vs usual scalar encoders is that it dynamically adjusts
    # its range to map the state space efficiently into SDR space.
    def __init__(self, player):
        # initial ranges for all state parameters are -0.1 .. +0.1
        self.maxims = 0.1 * np.ones(STATE_SIZE)
        self.minims = -self.maxims.copy()
        self.player = player

    def policy(self, state, reward, done):
        min_max_adjust(state, self.minims, self.maxims)
        sdr = sdr_encoder(state, self.minims, self.maxims)
        return self.player.policy(sdr, reward, done)


class AvoidantPlayer():
    """
    As I probably mentioned, its survival strategy is fear:
    Every time it dies, the state-action pairs preceding its death are saved
    into two fear maps: one ValueCorrMap encodes the fear to move left, the
    other one the fear to move right.
    The closer a state is to the dying one, the higher its fear level.
    """
    def __init__(self):
        self.dangers = [ValueCorrMap(sdr_size=SDR_SIZE) for _ in (LEFT, RIGHT)]
        self.new_game()

    def new_game(self):
        self.steps = []
        self.reward = 0

    def least_danger(self, sdr):
        # Returns which action is least dangerous, except when the two scores
        # are so close that it picks randomly.
        scores = [danger.score(sdr) for danger in self.dangers]
        e = 0.00001
        if max(scores) / (min(scores) + e) < 1.01:
            return random.randint(0, 1)
        # higher fear of LEFT -> pick RIGHT (1), otherwise LEFT (0)
        return int(scores[LEFT] > scores[RIGHT])

    def policy(self, sdr, reward, done):
        action = self.least_danger(sdr)
        self.steps.append((sdr, action))
        self.reward += reward
        if done:
            self.update_dangers()
            self.new_game()
        return action

    def update_dangers(self):
        # The game has ended. This is where learning happens
        # (see also the nightmare_penalties() sketch at the end of the file):
        if self.reward > 490:       # consider > 490 points a success
            print("yup! ", end='')
            return                  # nothing scary happened, don't record anything
        danger = 18
        while danger and len(self.steps):
            sdr, action = self.steps.pop()
            self.dangers[action].add(sdr, danger)
            danger -= 1


def cartpole_play(policy, n_episodes):
    # Uses the classic gym API, where reset() returns only the observation and
    # step() returns a 4-tuple (state, reward, done, info).
    cp = gym.make('CartPole-v1')
    for ep in range(n_episodes):
        state = cp.reset()
        istate = state
        done = False
        reward = 0
        steps = []
        rtotal = 0
        while True:
            action = policy(state, reward, done)
            if done:
                break
            state, reward, done, info = cp.step(action)
            cp.render()
            steps.append((action, state, reward, done))
            rtotal += reward
        yield ep, rtotal, istate, steps


player = SDR_Proxy_Player(AvoidantPlayer())

t = time()
total_steps = 0
hoorays = 0
for ep, total_reward, istate, steps in cartpole_play(player.policy, NUM_EPISODES):
    print(f"{ep+1:4d}: steps: {len(steps):3d}")
    if len(steps) == 500:
        hoorays += 1
    total_steps += len(steps)
t = int((time() - t) * 1000)

print(f"Played {total_steps} steps in {t}ms, hoorays: {hoorays}")
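

# A minimal, standalone sketch (not used by AvoidantPlayer, which does this
# in-place in update_dangers) of the graded "nightmare" penalty: the step
# right before death gets the largest penalty, decaying by one for each step
# further back and stopping after a window of 18 steps. The helper name
# nightmare_penalties is purely illustrative.
def nightmare_penalties(num_steps, window=18):
    # Penalty assigned to each of the last num_steps steps of an episode,
    # oldest first; steps older than the window get no penalty at all.
    penalties = [0] * num_steps
    danger, i = window, num_steps - 1
    while danger and i >= 0:
        penalties[i] = danger
        danger -= 1
        i -= 1
    return penalties

# e.g. nightmare_penalties(5)  -> [14, 15, 16, 17, 18]
#      nightmare_penalties(20) -> [0, 0, 1, 2, ..., 18]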