{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-22-frozenlake-crossentropy.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/T163940%20%7C%20FrozenLake%20using%20Cross-Entropy.ipynb","timestamp":1644661987980},{"file_id":"1WY2clU9Vi9fStD9TpZMYDMAytcSlFufw","timestamp":1634476338091}],"collapsed_sections":[],"toc_visible":true,"authorship_tag":"ABX9TyPqveZSzDpl8DRSYI64+UkH"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"VWxxs59VW5Ze"},"source":["# FrozenLake using Cross-Entropy"]},{"cell_type":"markdown","metadata":{"id":"cHm4lfOcHcTJ"},"source":["The next environment that we will try to solve using the cross-entropy method is FrozenLake. Its world is from the so-called grid world category, when your agent lives in a grid of size 4×4 and can move in four directions: up, down, left, and right. The agent always starts at a top-left position, and its goal is to reach the bottom-right cell of the grid. There are holes in the fixed cells of the grid and if you get into those holes, the episode ends and your reward is zero. If the agent reaches the destination cell, then it obtains a reward of 1.0 and the episode ends.\n","\n","To make life more complicated, the world is slippery (it's a frozen lake after all), so the agent's actions do not always turn out as expected—there is a 33% chance that it will slip to the right or to the left. If you want the agent to move left, for example, there is a 33% probability that it will, indeed, move left, a 33% chance that it will end up in the cell above, and a 33% chance that it will end up in the cell below. As you will see at the end of the section, this makes progress difficult."]},{"cell_type":"markdown","metadata":{"id":"4rvZWsyoYq61"},"source":["## Imports"]},{"cell_type":"code","metadata":{"id":"iadtqdDMXEmc"},"source":["import gym\n","import gym.spaces\n","import gym.wrappers\n","import gym.envs.toy_text.frozen_lake\n","\n","from collections import namedtuple\n","import numpy as np\n","import random\n","from torch.utils.tensorboard import SummaryWriter\n","\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"-mVebvwpZHLW"},"source":["## Naive method"]},{"cell_type":"markdown","metadata":{"id":"j4CDy2JhYsVt"},"source":["### Params"]},{"cell_type":"code","metadata":{"id":"gqzDLuAEXKnP"},"source":["HIDDEN_SIZE = 128\n","BATCH_SIZE = 16\n","PERCENTILE = 70"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"HOcxMHUYZI4s"},"source":["### Model"]},{"cell_type":"code","metadata":{"id":"qifV793qZNop"},"source":["class DiscreteOneHotWrapper(gym.ObservationWrapper):\n"," def __init__(self, env):\n"," super(DiscreteOneHotWrapper, self).__init__(env)\n"," assert isinstance(env.observation_space,\n"," gym.spaces.Discrete)\n"," shape = (env.observation_space.n, )\n"," self.observation_space = gym.spaces.Box(\n"," 0.0, 1.0, shape, dtype=np.float32)\n","\n"," def observation(self, observation):\n"," res = np.copy(self.observation_space.low)\n"," res[observation] = 1.0\n"," return res\n","\n","\n","class Net(nn.Module):\n"," def __init__(self, obs_size, hidden_size, n_actions):\n"," super(Net, self).__init__()\n"," self.net = nn.Sequential(\n"," nn.Linear(obs_size, hidden_size),\n"," nn.ReLU(),\n"," nn.Linear(hidden_size, n_actions)\n"," )\n","\n"," def forward(self, x):\n"," return self.net(x)\n","\n","\n","Episode = namedtuple('Episode', field_names=['reward', 'steps'])\n","EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])\n","\n","\n","def iterate_batches(env, net, batch_size):\n"," batch = []\n"," episode_reward = 0.0\n"," episode_steps = []\n"," obs = env.reset()\n"," sm = nn.Softmax(dim=1)\n"," while True:\n"," obs_v = torch.FloatTensor([obs])\n"," act_probs_v = sm(net(obs_v))\n"," act_probs = act_probs_v.data.numpy()[0]\n"," action = np.random.choice(len(act_probs), p=act_probs)\n"," next_obs, reward, is_done, _ = env.step(action)\n"," episode_reward += reward\n"," episode_steps.append(EpisodeStep(observation=obs, action=action))\n"," if is_done:\n"," batch.append(Episode(reward=episode_reward, steps=episode_steps))\n"," episode_reward = 0.0\n"," episode_steps = []\n"," next_obs = env.reset()\n"," if len(batch) == batch_size:\n"," yield batch\n"," batch = []\n"," obs = next_obs\n","\n","\n","def filter_batch(batch, percentile):\n"," rewards = list(map(lambda s: s.reward, batch))\n"," reward_bound = np.percentile(rewards, percentile)\n"," reward_mean = float(np.mean(rewards))\n","\n"," train_obs = []\n"," train_act = []\n"," for example in batch:\n"," if example.reward < reward_bound:\n"," continue\n"," train_obs.extend(map(lambda step: step.observation, example.steps))\n"," train_act.extend(map(lambda step: step.action, example.steps))\n","\n"," train_obs_v = torch.FloatTensor(train_obs)\n"," train_act_v = torch.LongTensor(train_act)\n"," return train_obs_v, train_act_v, reward_bound, reward_mean"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"N4e8DX4DYvkP"},"source":["### Run"]},{"cell_type":"code","metadata":{"id":"Iqz4Ma8ZXUgF"},"source":["if __name__ == \"__main__\":\n"," env = DiscreteOneHotWrapper(gym.make(\"FrozenLake-v0\"))\n"," # env = gym.wrappers.Monitor(env, directory=\"mon\", force=True)\n"," obs_size = env.observation_space.shape[0]\n"," n_actions = env.action_space.n\n","\n"," net = Net(obs_size, HIDDEN_SIZE, n_actions)\n"," objective = nn.CrossEntropyLoss()\n"," optimizer = optim.Adam(params=net.parameters(), lr=0.01)\n"," writer = SummaryWriter(comment=\"-frozenlake-naive\")\n","\n"," for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):\n"," obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)\n"," optimizer.zero_grad()\n"," action_scores_v = net(obs_v)\n"," loss_v = objective(action_scores_v, acts_v)\n"," loss_v.backward()\n"," optimizer.step()\n"," print(\"%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f\" % (\n"," iter_no, loss_v.item(), reward_m, reward_b))\n"," writer.add_scalar(\"loss\", loss_v.item(), iter_no)\n"," writer.add_scalar(\"reward_bound\", reward_b, iter_no)\n"," writer.add_scalar(\"reward_mean\", reward_m, iter_no)\n"," if reward_m > 0.8:\n"," print(\"Solved!\")\n"," break\n"," writer.close()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"F2660IBXZiYN"},"source":["## Tweaked method"]},{"cell_type":"markdown","metadata":{"id":"l1-_XbSmZmvD"},"source":["### Params"]},{"cell_type":"code","metadata":{"id":"byKK0wRTZwDu"},"source":["HIDDEN_SIZE = 128\n","BATCH_SIZE = 100\n","PERCENTILE = 30\n","GAMMA = 0.9"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"kc31rOrbZrLe"},"source":["### Model"]},{"cell_type":"code","metadata":{"id":"N3MOfJXtZz-g"},"source":["class DiscreteOneHotWrapper(gym.ObservationWrapper):\n"," def __init__(self, env):\n"," super(DiscreteOneHotWrapper, self).__init__(env)\n"," assert isinstance(env.observation_space, gym.spaces.Discrete)\n"," self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)\n","\n"," def observation(self, observation):\n"," res = np.copy(self.observation_space.low)\n"," res[observation] = 1.0\n"," return res\n","\n","\n","class Net(nn.Module):\n"," def __init__(self, obs_size, hidden_size, n_actions):\n"," super(Net, self).__init__()\n"," self.net = nn.Sequential(\n"," nn.Linear(obs_size, hidden_size),\n"," nn.ReLU(),\n"," nn.Linear(hidden_size, n_actions)\n"," )\n","\n"," def forward(self, x):\n"," return self.net(x)\n","\n","\n","Episode = namedtuple('Episode', field_names=['reward', 'steps'])\n","EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])\n","\n","\n","def iterate_batches(env, net, batch_size):\n"," batch = []\n"," episode_reward = 0.0\n"," episode_steps = []\n"," obs = env.reset()\n"," sm = nn.Softmax(dim=1)\n"," while True:\n"," obs_v = torch.FloatTensor([obs])\n"," act_probs_v = sm(net(obs_v))\n"," act_probs = act_probs_v.data.numpy()[0]\n"," action = np.random.choice(len(act_probs), p=act_probs)\n"," next_obs, reward, is_done, _ = env.step(action)\n"," episode_reward += reward\n"," episode_steps.append(EpisodeStep(observation=obs, action=action))\n"," if is_done:\n"," batch.append(Episode(reward=episode_reward, steps=episode_steps))\n"," episode_reward = 0.0\n"," episode_steps = []\n"," next_obs = env.reset()\n"," if len(batch) == batch_size:\n"," yield batch\n"," batch = []\n"," obs = next_obs\n","\n","\n","def filter_batch(batch, percentile):\n"," filter_fun = lambda s: s.reward * (GAMMA ** len(s.steps))\n"," disc_rewards = list(map(filter_fun, batch))\n"," reward_bound = np.percentile(disc_rewards, percentile)\n","\n"," train_obs = []\n"," train_act = []\n"," elite_batch = []\n"," for example, discounted_reward in zip(batch, disc_rewards):\n"," if discounted_reward > reward_bound:\n"," train_obs.extend(map(lambda step: step.observation,\n"," example.steps))\n"," train_act.extend(map(lambda step: step.action,\n"," example.steps))\n"," elite_batch.append(example)\n","\n"," return elite_batch, train_obs, train_act, reward_bound"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"xsRuD07BZrv_"},"source":["### Run"]},{"cell_type":"code","metadata":{"id":"ZHQU4R8rZmsh"},"source":["if __name__ == \"__main__\":\n"," random.seed(12345)\n"," env = DiscreteOneHotWrapper(gym.make(\"FrozenLake-v0\"))\n"," # env = gym.wrappers.Monitor(env, directory=\"mon\", force=True)\n"," obs_size = env.observation_space.shape[0]\n"," n_actions = env.action_space.n\n","\n"," net = Net(obs_size, HIDDEN_SIZE, n_actions)\n"," objective = nn.CrossEntropyLoss()\n"," optimizer = optim.Adam(params=net.parameters(), lr=0.001)\n"," writer = SummaryWriter(comment=\"-frozenlake-tweaked\")\n","\n"," full_batch = []\n"," for iter_no, batch in enumerate(iterate_batches(\n"," env, net, BATCH_SIZE)):\n"," reward_mean = float(np.mean(list(map(\n"," lambda s: s.reward, batch))))\n"," full_batch, obs, acts, reward_bound = \\\n"," filter_batch(full_batch + batch, PERCENTILE)\n"," if not full_batch:\n"," continue\n"," obs_v = torch.FloatTensor(obs)\n"," acts_v = torch.LongTensor(acts)\n"," full_batch = full_batch[-500:]\n","\n"," optimizer.zero_grad()\n"," action_scores_v = net(obs_v)\n"," loss_v = objective(action_scores_v, acts_v)\n"," loss_v.backward()\n"," optimizer.step()\n"," print(\"%d: loss=%.3f, rw_mean=%.3f, \"\n"," \"rw_bound=%.3f, batch=%d\" % (\n"," iter_no, loss_v.item(), reward_mean,\n"," reward_bound, len(full_batch)))\n"," writer.add_scalar(\"loss\", loss_v.item(), iter_no)\n"," writer.add_scalar(\"reward_mean\", reward_mean, iter_no)\n"," writer.add_scalar(\"reward_bound\", reward_bound, iter_no)\n"," if reward_mean > 0.8:\n"," print(\"Solved!\")\n"," break\n"," writer.close()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"TUlp9A3OZmqv"},"source":["## Non-slippery method"]},{"cell_type":"markdown","metadata":{"id":"AXxoXim1Zmpb"},"source":["### Params"]},{"cell_type":"code","metadata":{"id":"OITP8dr6ZmmQ"},"source":["HIDDEN_SIZE = 128\n","BATCH_SIZE = 100\n","PERCENTILE = 30\n","GAMMA = 0.9"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_C6bU29kZmkN"},"source":["### Model"]},{"cell_type":"code","metadata":{"id":"Q00QDuQoaULb"},"source":["class DiscreteOneHotWrapper(gym.ObservationWrapper):\n"," def __init__(self, env):\n"," super(DiscreteOneHotWrapper, self).__init__(env)\n"," assert isinstance(env.observation_space, gym.spaces.Discrete)\n"," self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)\n","\n"," def observation(self, observation):\n"," res = np.copy(self.observation_space.low)\n"," res[observation] = 1.0\n"," return res\n","\n","\n","class Net(nn.Module):\n"," def __init__(self, obs_size, hidden_size, n_actions):\n"," super(Net, self).__init__()\n"," self.net = nn.Sequential(\n"," nn.Linear(obs_size, hidden_size),\n"," nn.ReLU(),\n"," nn.Linear(hidden_size, n_actions)\n"," )\n","\n"," def forward(self, x):\n"," return self.net(x)\n","\n","\n","Episode = namedtuple('Episode', field_names=['reward', 'steps'])\n","EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])\n","\n","\n","def iterate_batches(env, net, batch_size):\n"," batch = []\n"," episode_reward = 0.0\n"," episode_steps = []\n"," obs = env.reset()\n"," sm = nn.Softmax(dim=1)\n"," while True:\n"," obs_v = torch.FloatTensor([obs])\n"," act_probs_v = sm(net(obs_v))\n"," act_probs = act_probs_v.data.numpy()[0]\n"," action = np.random.choice(len(act_probs), p=act_probs)\n"," next_obs, reward, is_done, _ = env.step(action)\n"," episode_reward += reward\n"," episode_steps.append(EpisodeStep(observation=obs, action=action))\n"," if is_done:\n"," batch.append(Episode(reward=episode_reward, steps=episode_steps))\n"," episode_reward = 0.0\n"," episode_steps = []\n"," next_obs = env.reset()\n"," if len(batch) == batch_size:\n"," yield batch\n"," batch = []\n"," obs = next_obs\n","\n","\n","def filter_batch(batch, percentile):\n"," disc_rewards = list(map(lambda s: s.reward * (GAMMA ** len(s.steps)), batch))\n"," reward_bound = np.percentile(disc_rewards, percentile)\n","\n"," train_obs = []\n"," train_act = []\n"," elite_batch = []\n"," for example, discounted_reward in zip(batch, disc_rewards):\n"," if discounted_reward > reward_bound:\n"," train_obs.extend(map(lambda step: step.observation, example.steps))\n"," train_act.extend(map(lambda step: step.action, example.steps))\n"," elite_batch.append(example)\n","\n"," return elite_batch, train_obs, train_act, reward_bound"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"SySYj3j7aUH3"},"source":["### Run"]},{"cell_type":"code","metadata":{"id":"pKCWKPo3aUFp"},"source":["if __name__ == \"__main__\":\n"," random.seed(12345)\n"," env = gym.envs.toy_text.frozen_lake.FrozenLakeEnv(\n"," is_slippery=False)\n"," env.spec = gym.spec(\"FrozenLake-v0\")\n"," env = gym.wrappers.TimeLimit(env, max_episode_steps=100)\n"," env = DiscreteOneHotWrapper(env)\n"," # env = gym.wrappers.Monitor(env, directory=\"mon\", force=True)\n"," obs_size = env.observation_space.shape[0]\n"," n_actions = env.action_space.n\n","\n"," net = Net(obs_size, HIDDEN_SIZE, n_actions)\n"," objective = nn.CrossEntropyLoss()\n"," optimizer = optim.Adam(params=net.parameters(), lr=0.001)\n"," writer = SummaryWriter(comment=\"-frozenlake-nonslippery\")\n","\n"," full_batch = []\n"," for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):\n"," reward_mean = float(np.mean(list(map(lambda s: s.reward, batch))))\n"," full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE)\n"," if not full_batch:\n"," continue\n"," obs_v = torch.FloatTensor(obs)\n"," acts_v = torch.LongTensor(acts)\n"," full_batch = full_batch[-500:]\n","\n"," optimizer.zero_grad()\n"," action_scores_v = net(obs_v)\n"," loss_v = objective(action_scores_v, acts_v)\n"," loss_v.backward()\n"," optimizer.step()\n"," print(\"%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f, batch=%d\" % (\n"," iter_no, loss_v.item(), reward_mean, reward_bound, len(full_batch)))\n"," writer.add_scalar(\"loss\", loss_v.item(), iter_no)\n"," writer.add_scalar(\"reward_mean\", reward_mean, iter_no)\n"," writer.add_scalar(\"reward_bound\", reward_bound, iter_no)\n"," if reward_mean > 0.8:\n"," print(\"Solved!\")\n"," break\n"," writer.close()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"EIH6m2kzaYFy"},"source":["## Visualization"]},{"cell_type":"code","metadata":{"id":"mqdYU6qhXYg2"},"source":["%load_ext tensorboard\n","%tensorboard --logdir runs"],"execution_count":null,"outputs":[]}]}