{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-20-rl-social.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/T098537%20%7C%20Building%20an%20RL%20Agent%20to%20manage%20social%20media%20accounts%20on%20the%20web.ipynb","timestamp":1644653336407},{"file_id":"12No3I0gmuettQ3DX5TXZI69FsjgFqTKm","timestamp":1638512065762},{"file_id":"1G-E2x6mzYYG8pvbgbHRO3QTM68fN9yQs","timestamp":1638511589199},{"file_id":"1P927fpgrxOd_nYl_ivXhz-yPLGUyyWjZ","timestamp":1638510924062},{"file_id":"1kbq3C9K_CcdkD4nGqRKBO_QRuIFc3CKa","timestamp":1638508636433}],"collapsed_sections":[],"toc_visible":true,"mount_file_id":"1kbq3C9K_CcdkD4nGqRKBO_QRuIFc3CKa","authorship_tag":"ABX9TyMcyvJV6hf/34Rb0ZkZcjR0"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"LhETGbFeyexq"},"source":["# Building an RL Agent to manage social media accounts on the web"]},{"cell_type":"code","metadata":{"id":"fsn3Y3yObopB"},"source":["!wget -q --show-progress https://github.com/RecoHut-Projects/drl-recsys/raw/S990517/tools/webgym.zip\n","!unzip webgym.zip"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"Ct-TsEZqajVL"},"source":["!pip install selenium\n","!apt-get update # to update ubuntu to correctly run apt install\n","!apt install chromium-chromedriver\n","!cp /usr/lib/chromium-browser/chromedriver /usr/bin"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"I5zXnSjmeOJl"},"source":["import sys\n","sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"6aCFmU0TjUIA"},"source":["import webgym"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"_ODJjGqoa5ml"},"source":["import argparse\n","import os\n","import copy\n","import random\n","from collections import deque\n","from datetime import datetime\n","\n","import 
gym\n","import numpy as np\n","import tensorflow as tf\n","from tensorflow.keras.layers import (\n"," Conv2D,\n"," Dense,\n"," Dropout,\n"," Flatten,\n"," Input,\n"," Lambda,\n"," MaxPool2D,\n"," concatenate,\n",")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"2SsdztX_pc6j"},"source":["%load_ext tensorboard"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"lxcyEZCVdYN-"},"source":["tf.keras.backend.set_floatx(\"float64\")"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"tnLHxGIj8iYF"},"source":["## Social Media Like Reply Agent"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"B8YIBwEudcT9","executionInfo":{"status":"ok","timestamp":1638512367533,"user_tz":-330,"elapsed":483,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"e0c35986-51af-4fea-e8b0-fb4ac4b07841"},"source":["parser = argparse.ArgumentParser(prog=\"TFRL-SocialMedia-Like-Reply-Agent\")\n","parser.add_argument(\"--env\", default=\"MiniWoBSocialMediaReplyVisualEnv-v0\")\n","parser.add_argument(\"--update-freq\", type=int, default=16)\n","parser.add_argument(\"--epochs\", type=int, default=3)\n","parser.add_argument(\"--actor-lr\", type=float, default=1e-4)\n","parser.add_argument(\"--critic-lr\", type=float, default=1e-4)\n","parser.add_argument(\"--clip-ratio\", type=float, default=0.1)\n","parser.add_argument(\"--gae-lambda\", type=float, default=0.95)\n","parser.add_argument(\"--gamma\", type=float, default=0.99)\n","parser.add_argument(\"--logdir\", default=\"logs\")\n","\n","args = parser.parse_args([])\n","logdir = os.path.join(\n"," args.logdir, parser.prog, args.env, datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",")\n","print(f\"Saving training logs to:{logdir}\")\n","writer = 
class Actor:
    """Gaussian policy network used by the PPO agent.

    The model maps an image observation to two outputs of width
    ``action_dim[0]``: a mean (``mu_output``) and a standard deviation
    (``std_output``), both scaled by ``action_bound``.

    Reads ``args.actor_lr`` and ``args.clip_ratio`` from the notebook-level
    ``args`` namespace created in the configuration cell.
    """

    def __init__(self, state_dim, action_dim, action_bound, std_bound):
        """Build the policy model and its optimizer.

        Args:
            state_dim: Observation shape handed to the Keras ``Input`` layer.
            action_dim: Action-space shape; ``action_dim[0]`` is the width of
                both output heads.
            action_bound: Per-dimension upper bound for actions.
            std_bound: ``[min, max]`` range the std is clipped to in
                :meth:`log_pdf`.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = np.array(action_bound)
        self.std_bound = std_bound
        self.weight_initializer = tf.keras.initializers.he_normal()
        self.eps = 1e-5
        self.model = self.nn_model()
        self.model.summary()  # Print a summary of the Actor model
        self.opt = tf.keras.optimizers.Nadam(args.actor_lr)

    def nn_model(self):
        """Create the Keras model: 4 conv/max-pool stages, 2 dense layers, 2 heads.

        Returns:
            A ``tf.keras.Model`` named ``"Actor"`` with outputs
            ``[mu_output, std_output]``.
        """
        obs_input = Input(self.state_dim, name="im_obs")
        conv1 = Conv2D(
            filters=64,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="same",
            input_shape=self.state_dim,
            data_format="channels_last",
            activation="relu",
        )(obs_input)
        pool1 = MaxPool2D(pool_size=(3, 3), strides=1)(conv1)
        conv2 = Conv2D(
            filters=32,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool1)
        pool2 = MaxPool2D(pool_size=(3, 3), strides=1)(conv2)
        conv3 = Conv2D(
            filters=16,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool2)
        pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)
        conv4 = Conv2D(
            filters=8,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool3)
        pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)
        flat = Flatten()(pool4)
        dense1 = Dense(
            16, activation="relu", kernel_initializer=self.weight_initializer
        )(flat)
        dropout1 = Dropout(0.3)(dense1)
        dense2 = Dense(
            8, activation="relu", kernel_initializer=self.weight_initializer
        )(dropout1)
        dropout2 = Dropout(0.3)(dense2)
        # action_dim[0] = 2
        output_val = Dense(
            self.action_dim[0],
            activation="relu",
            kernel_initializer=self.weight_initializer,
        )(dropout2)
        # Scale & clip x[i] to be in range [1e-9, action_bound[i]].
        # The bound is copied so the Lambda closures do not alias self.action_bound.
        action_bound = copy.deepcopy(self.action_bound)
        mu_output = Lambda(
            lambda x: tf.clip_by_value(x * action_bound, 1e-9, action_bound),
            name="mu_output",
        )(output_val)
        std_output_1 = Dense(
            self.action_dim[0],
            activation="softplus",
            kernel_initializer=self.weight_initializer,
        )(dropout2)
        # Fix: `name` belongs to the Lambda layer. It used to be passed to
        # tf.clip_by_value, which left the layer with the auto-generated name "lambda".
        std_output = Lambda(
            lambda x: tf.clip_by_value(x * action_bound, 1e-9, action_bound / 2),
            name="std_output",
        )(std_output_1)
        return tf.keras.models.Model(
            inputs=obs_input, outputs=[mu_output, std_output], name="Actor"
        )

    def get_action(self, state):
        """Sample one action for ``state`` and return its log-probability.

        Args:
            state: Either a sequence holding one image observation or a single
                image array; a rank-3 input gets a leading batch axis added.

        Returns:
            ``(log_policy, action)`` where ``action`` is a 1-tuple holding an
            integer array clipped to ``[0, action_bound]`` and ``log_policy``
            is the value of :meth:`log_pdf` for that action.
        """
        # Convert [Image] to np.array(np.ndarray)
        state_np = np.array([np.array(s) for s in state])
        if state_np.ndim == 3:
            # Convert (w, h, c) to (1, w, h, c)
            state_np = np.expand_dims(state_np, 0)
        # verbose=0: this runs once per environment step, so keep Keras quiet.
        mu, std = self.model.predict(state_np, verbose=0)
        # NOTE(review): sampling uses the std produced by the network (clipped to
        # action_bound / 2 there), while log_pdf re-clips std to std_bound. The
        # returned log-probability therefore describes a narrower distribution
        # than the one sampled from -- confirm this is intended.
        action = np.random.normal(
            mu[0], std[0] + self.eps, size=self.action_dim
        ).astype("int")
        # Clip action to be between 0 and max obs screen size
        action = np.clip(action, 0, self.action_bound)
        # 1 Action per instance of env; Env expects: (num_instances, actions)
        action = (action,)
        log_policy = self.log_pdf(mu, std, action)
        return log_policy, action

    def log_pdf(self, mu, std, action):
        """Log-density of ``action`` under independent normals ``N(mu, std)``.

        ``std`` is first clipped to ``std_bound``. The per-dimension
        log-densities are summed over axis 1 with ``keepdims=True``, so the
        result has a trailing axis of size 1.
        """
        std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
        var = std ** 2
        log_policy_pdf = -0.5 * (action - mu) ** 2 / var - 0.5 * tf.math.log(
            var * 2 * np.pi
        )
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)

    def compute_loss(self, log_old_policy, log_new_policy, actions, gaes):
        """Clipped PPO surrogate loss (negated, averaged over the batch).

        Args:
            log_old_policy: Log-probabilities recorded when the actions were
                taken; one value per sample, any shape with N elements.
            log_new_policy: Log-probabilities under the current policy, as
                returned by :meth:`log_pdf` (shape ``(N, 1)``).
            actions: Unused; kept for interface compatibility.
            gaes: Advantage estimates; one value per sample.

        Returns:
            Scalar tensor to minimise.
        """
        # Fix: log_new_policy is (N, 1) but the training loop passes old
        # log-probs and advantages as (N,). Without aligning the shapes the
        # subtraction/multiplication broadcast to (N, N), pairing sample i's new
        # log-prob with sample j's old log-prob and advantage.
        target_shape = tf.shape(log_new_policy)
        log_old_policy = tf.reshape(
            tf.cast(tf.stop_gradient(log_old_policy), log_new_policy.dtype),
            target_shape,
        )
        gaes = tf.reshape(
            tf.cast(tf.stop_gradient(gaes), log_new_policy.dtype), target_shape
        )
        # Cap the exponent at 80 so tf.exp cannot overflow
        # (float32 exp overflows for arguments slightly above 88).
        ratio = tf.exp(tf.minimum(log_new_policy - log_old_policy, 80))
        clipped_ratio = tf.clip_by_value(
            ratio, 1.0 - args.clip_ratio, 1.0 + args.clip_ratio
        )
        surrogate = -tf.minimum(ratio * gaes, clipped_ratio * gaes)
        return tf.reduce_mean(surrogate)

    def train(self, log_old_policy, states, actions, gaes):
        """Run one gradient step on the policy and return the loss."""
        with tf.GradientTape() as tape:
            mu, std = self.model(states, training=True)
            log_new_policy = self.log_pdf(mu, std, actions)
            loss = self.compute_loss(log_old_policy, log_new_policy, actions, gaes)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

    def save(self, model_dir: str, version: int = 1):
        """Save the model under ``<model_dir>/actor/<version>/model.savedmodel``."""
        actor_model_save_dir = os.path.join(
            model_dir, "actor", str(version), "model.savedmodel"
        )
        self.model.save(actor_model_save_dir, save_format="tf")
        print(f"Actor model saved at:{actor_model_save_dir}")


class Critic:
    """State-value network used by the PPO agent.

    Reads ``args.critic_lr`` from the notebook-level ``args`` namespace.
    """

    def __init__(self, state_dim):
        """Build the value model and its optimizer.

        Args:
            state_dim: Observation shape handed to the Keras ``Input`` layer.
        """
        self.state_dim = state_dim
        self.weight_initializer = tf.keras.initializers.he_normal()
        self.model = self.nn_model()
        self.model.summary()  # Print a summary of the Critic model
        self.opt = tf.keras.optimizers.Nadam(args.critic_lr)

    def nn_model(self):
        """Create the Keras model: 4 conv/max-pool stages, 2 dense layers, 1 output.

        Unlike the actor, the first two pooling layers use ``strides=2``.

        Returns:
            A ``tf.keras.Model`` named ``"Critic"`` with a single linear output.
        """
        obs_input = Input(self.state_dim)
        conv1 = Conv2D(
            filters=64,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="same",
            input_shape=self.state_dim,
            data_format="channels_last",
            activation="relu",
        )(obs_input)
        pool1 = MaxPool2D(pool_size=(3, 3), strides=2)(conv1)
        conv2 = Conv2D(
            filters=32,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool1)
        pool2 = MaxPool2D(pool_size=(3, 3), strides=2)(conv2)
        conv3 = Conv2D(
            filters=16,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool2)
        pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)
        conv4 = Conv2D(
            filters=8,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool3)
        pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)
        flat = Flatten()(pool4)
        dense1 = Dense(
            16, activation="relu", kernel_initializer=self.weight_initializer
        )(flat)
        dropout1 = Dropout(0.3)(dense1)
        dense2 = Dense(
            8, activation="relu", kernel_initializer=self.weight_initializer
        )(dropout1)
        dropout2 = Dropout(0.3)(dense2)
        value = Dense(
            1, activation="linear", kernel_initializer=self.weight_initializer
        )(dropout2)

        return tf.keras.models.Model(inputs=obs_input, outputs=value, name="Critic")

    def compute_loss(self, v_pred, td_targets):
        """Mean squared error between predicted values and their targets."""
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, td_targets):
        """Run one gradient step on the value network and return the loss.

        Args:
            states: Batch of observations.
            td_targets: One regression target per observation.
        """
        with tf.GradientTape() as tape:
            v_pred = self.model(states, training=True)
            # The training loop supplies targets as (N,) while v_pred is (N, 1);
            # align them explicitly instead of relying on the loss to squeeze.
            td_targets = tf.stop_gradient(
                tf.reshape(tf.cast(td_targets, v_pred.dtype), tf.shape(v_pred))
            )
            loss = self.compute_loss(v_pred, td_targets)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

    def save(self, model_dir: str, version: int = 1):
        """Save the model under ``<model_dir>/critic/<version>/model.savedmodel``."""
        critic_model_save_dir = os.path.join(
            model_dir, "critic", str(version), "model.savedmodel"
        )
        self.model.save(critic_model_save_dir, save_format="tf")
        print(f"Critic model saved at:{critic_model_save_dir}")


class PPOAgent:
    """PPO training driver that couples an :class:`Actor` and a :class:`Critic`.

    Reads ``args.gamma``, ``args.gae_lambda``, ``args.update_freq`` and
    ``args.epochs`` from the notebook-level ``args`` namespace and logs
    through the notebook-level summary ``writer``.
    """

    def __init__(self, env):
        """Create the actor and critic for ``env``.

        Args:
            env: Environment exposing ``observation_space``, ``action_space``,
                ``task_width``, ``task_height``, ``reset()`` and ``step()``.
        """
        self.env = env
        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.shape
        # Set action_bounds to be within the actual task-window/browser-view of the agent
        self.action_bound = [self.env.task_width, self.env.task_height]
        self.std_bound = [1e-2, 1.0]

        self.actor = Actor(
            self.state_dim, self.action_dim, self.action_bound, self.std_bound
        )
        self.critic = Critic(self.state_dim)

    def gae_target(self, rewards, v_values, next_v_value, done):
        """Compute generalised advantage estimates and value targets.

        Args:
            rewards: One reward per step (N values).
            v_values: Critic value for each of those steps (N values; an
                ``(N, 1)`` prediction array is accepted).
            next_v_value: Critic value of the state following the last step;
                only used when ``done`` is false.
            done: Whether the trajectory ended at the last step.

        Returns:
            ``(gae, n_step_targets)``, two float arrays of shape ``(N,)``.
        """
        # Fix: flatten the critic outputs so every quantity in the loop is a plain
        # scalar. Previously (1,)/(1, 1) arrays were assigned into scalar slots,
        # which newer NumPy versions deprecate.
        rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
        v_values = np.asarray(v_values, dtype=np.float64).reshape(-1)
        n_step_targets = np.zeros_like(rewards)
        gae = np.zeros_like(rewards)
        gae_cumulative = 0.0
        forward_val = 0.0

        if not done:
            # Bootstrap from the value of the state after the last step.
            forward_val = float(np.asarray(next_v_value).reshape(-1)[0])

        for k in reversed(range(len(rewards))):
            delta = rewards[k] + args.gamma * forward_val - v_values[k]
            gae_cumulative = args.gamma * args.gae_lambda * gae_cumulative + delta
            gae[k] = gae_cumulative
            forward_val = v_values[k]
            n_step_targets[k] = gae[k] + v_values[k]
        return gae, n_step_targets

    def train(self, max_episodes=1000):
        """Run the PPO training loop for ``max_episodes`` episodes.

        Transitions are buffered and the actor and critic are updated for
        ``args.epochs`` epochs whenever ``args.update_freq`` transitions have
        accumulated or the episode ends. Losses and the episode reward are
        written to the notebook-level ``writer``.
        """
        with writer.as_default():
            for ep in range(max_episodes):
                state_batch = []
                action_batch = []
                reward_batch = []
                old_policy_batch = []
                # Fix: kept separately from action_batch, which is emptied on every
                # update (including the one at episode end), so the end-of-episode
                # log used to always print an empty list.
                episode_actions = []

                episode_reward, done = 0, False

                state = self.env.reset()
                prev_state = state
                step_num = 0

                while not done:
                    # self.env.render()
                    log_old_policy, action = self.actor.get_action(state)

                    next_state, reward, dones, _ = self.env.step(action)
                    step_num += 1
                    print(
                        f"ep#:{ep} step#:{step_num} step_rew:{reward} action:{action} dones:{dones}"
                    )
                    done = np.all(dones)
                    if done:
                        # Reuse the last observation seen before the episode ended.
                        next_state = prev_state
                    else:
                        prev_state = next_state
                    state = np.array([np.array(s) for s in state])
                    next_state = np.array([np.array(s) for s in next_state])
                    reward = np.reshape(reward, [1, 1])
                    log_old_policy = np.reshape(log_old_policy, [1, 1])

                    state_batch.append(state)
                    action_batch.append(action)
                    episode_actions.append(action)
                    # NOTE(review): rewards are stored as (r + 8) / 8. The step rewards
                    # logged for this environment are 0.0 and -1.0 -- confirm this
                    # offset/scale is intended here.
                    reward_batch.append((reward + 8) / 8)
                    old_policy_batch.append(log_old_policy)

                    if len(state_batch) >= args.update_freq or done:
                        states = np.array([obs.squeeze() for obs in state_batch])
                        # Convert ([x, y],) to [x, y]
                        actions = np.array([act[0] for act in action_batch])
                        rewards = np.array([rew.squeeze() for rew in reward_batch])
                        old_policies = np.array(
                            [old_pi.squeeze() for old_pi in old_policy_batch]
                        )

                        v_values = self.critic.model.predict(states, verbose=0)
                        next_v_value = self.critic.model.predict(
                            next_state, verbose=0
                        )

                        gaes, td_targets = self.gae_target(
                            rewards, v_values, next_v_value, done
                        )
                        actor_losses, critic_losses = [], []
                        for _ in range(args.epochs):
                            actor_loss = self.actor.train(
                                old_policies, states, actions, gaes
                            )
                            actor_losses.append(actor_loss)
                            critic_loss = self.critic.train(states, td_targets)
                            critic_losses.append(critic_loss)
                        # Plot mean actor & critic losses on every update
                        tf.summary.scalar("actor_loss", np.mean(actor_losses), step=ep)
                        tf.summary.scalar(
                            "critic_loss", np.mean(critic_losses), step=ep
                        )

                        state_batch = []
                        action_batch = []
                        reward_batch = []
                        old_policy_batch = []

                    episode_reward += reward[0][0]
                    state = next_state[0]

                print(f"Episode#{ep} Reward:{episode_reward} Actions:{episode_actions}")
                tf.summary.scalar("episode_reward", episode_reward, step=ep)

    def save(self, model_dir: str, version: int = 1):
        """Save the actor and the critic below ``model_dir``."""
        self.actor.save(model_dir, version)
        self.critic.save(model_dir, version)


if __name__ == "__main__":
    env_name = args.env
    env = gym.make(env_name)
    try:
        cta_agent = PPOAgent(env)
        cta_agent.train(max_episodes=2)
        # Model saving
        model_dir = "trained_models"
        # NOTE(review): args.env already ends in "-v0", so the saved path contains
        # "-v0-v0"; left unchanged in case existing paths are relied upon.
        agent_name = f"PPO_{env_name}-v0"
        agent_version = 1
        agent_model_path = os.path.join(model_dir, agent_name)
        cta_agent.save(agent_model_path, agent_version)
    finally:
        # Release the environment's resources even if training or saving fails.
        env.close()
\n","==================================================================================================\n","Total params: 346,228\n","Trainable params: 346,228\n","Non-trainable params: 0\n","__________________________________________________________________________________________________\n","Model: \"Critic\"\n","_________________________________________________________________\n"," Layer (type) Output Shape Param # \n","=================================================================\n"," input_1 (InputLayer) [(None, 64, 64, 3)] 0 \n"," \n"," conv2d_8 (Conv2D) (None, 64, 64, 64) 1792 \n"," \n"," max_pooling2d_8 (MaxPooling (None, 31, 31, 64) 0 \n"," 2D) \n"," \n"," conv2d_9 (Conv2D) (None, 29, 29, 32) 18464 \n"," \n"," max_pooling2d_9 (MaxPooling (None, 14, 14, 32) 0 \n"," 2D) \n"," \n"," conv2d_10 (Conv2D) (None, 12, 12, 16) 4624 \n"," \n"," max_pooling2d_10 (MaxPoolin (None, 10, 10, 16) 0 \n"," g2D) \n"," \n"," conv2d_11 (Conv2D) (None, 8, 8, 8) 1160 \n"," \n"," max_pooling2d_11 (MaxPoolin (None, 6, 6, 8) 0 \n"," g2D) \n"," \n"," flatten_2 (Flatten) (None, 288) 0 \n"," \n"," dense_7 (Dense) (None, 16) 4624 \n"," \n"," dropout_4 (Dropout) (None, 16) 0 \n"," \n"," dense_8 (Dense) (None, 8) 136 \n"," \n"," dropout_5 (Dropout) (None, 8) 0 \n"," \n"," dense_9 (Dense) (None, 1) 9 \n"," \n","=================================================================\n","Total params: 30,809\n","Trainable params: 30,809\n","Non-trainable params: 0\n","_________________________________________________________________\n","ep#:0 step#:1 step_rew:[0.0] action:(array([ 70, 210]),) dones:[False]\n","ep#:0 step#:2 step_rew:[0.0] action:(array([160, 196]),) dones:[False]\n","ep#:0 step#:3 step_rew:[0.0] action:(array([ 43, 193]),) dones:[False]\n","ep#:0 step#:4 step_rew:[0.0] action:(array([160, 209]),) dones:[False]\n","ep#:0 step#:5 step_rew:[0.0] action:(array([160, 205]),) dones:[False]\n","ep#:0 step#:6 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:0 step#:7 
step_rew:[0.0] action:(array([132, 202]),) dones:[False]\n","ep#:0 step#:8 step_rew:[0.0] action:(array([160, 201]),) dones:[False]\n","ep#:0 step#:9 step_rew:[0.0] action:(array([ 41, 210]),) dones:[False]\n","ep#:0 step#:10 step_rew:[0.0] action:(array([160, 205]),) dones:[False]\n","ep#:0 step#:11 step_rew:[0.0] action:(array([160, 205]),) dones:[False]\n","ep#:0 step#:12 step_rew:[0.0] action:(array([ 75, 210]),) dones:[False]\n","ep#:0 step#:13 step_rew:[0.0] action:(array([160, 205]),) dones:[False]\n","ep#:0 step#:14 step_rew:[0.0] action:(array([160, 190]),) dones:[False]\n","ep#:0 step#:15 step_rew:[0.0] action:(array([160, 202]),) dones:[False]\n","ep#:0 step#:16 step_rew:[0.0] action:(array([106, 200]),) dones:[False]\n","ep#:0 step#:17 step_rew:[0.0] action:(array([ 51, 188]),) dones:[False]\n","ep#:0 step#:18 step_rew:[0.0] action:(array([ 59, 210]),) dones:[False]\n","ep#:0 step#:19 step_rew:[0.0] action:(array([ 94, 203]),) dones:[False]\n","ep#:0 step#:20 step_rew:[0.0] action:(array([160, 208]),) dones:[False]\n","ep#:0 step#:21 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:0 step#:22 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:0 step#:23 step_rew:[0.0] action:(array([ 50, 207]),) dones:[False]\n","ep#:0 step#:24 step_rew:[0.0] action:(array([137, 210]),) dones:[False]\n","ep#:0 step#:25 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:0 step#:26 step_rew:[0.0] action:(array([133, 204]),) dones:[False]\n","ep#:0 step#:27 step_rew:[0.0] action:(array([ 66, 210]),) dones:[False]\n","ep#:0 step#:28 step_rew:[0.0] action:(array([ 42, 207]),) dones:[False]\n","ep#:0 step#:29 step_rew:[0.0] action:(array([115, 209]),) dones:[False]\n","ep#:0 step#:30 step_rew:[0.0] action:(array([101, 210]),) dones:[False]\n","ep#:0 step#:31 step_rew:[0.0] action:(array([160, 208]),) dones:[False]\n","ep#:0 step#:32 step_rew:[0.0] action:(array([160, 210]),) 
dones:[False]\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:root:Cannot call CoordClick(coords: (68, 205)) on instance 0, which is already done\n"]},{"output_type":"stream","name":"stdout","text":["ep#:0 step#:33 step_rew:[-1.0] action:(array([ 68, 205]),) dones:[True]\n","Episode#0 Reward:-1.0 Actions:[]\n","ep#:1 step#:1 step_rew:[0.0] action:(array([160, 197]),) dones:[False]\n","ep#:1 step#:2 step_rew:[0.0] action:(array([160, 196]),) dones:[False]\n","ep#:1 step#:3 step_rew:[0.0] action:(array([160, 201]),) dones:[False]\n","ep#:1 step#:4 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:1 step#:5 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:1 step#:6 step_rew:[0.0] action:(array([160, 192]),) dones:[False]\n","ep#:1 step#:7 step_rew:[0.0] action:(array([160, 203]),) dones:[False]\n","ep#:1 step#:8 step_rew:[0.0] action:(array([160, 205]),) dones:[False]\n","ep#:1 step#:9 step_rew:[0.0] action:(array([ 43, 210]),) dones:[False]\n","ep#:1 step#:10 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:1 step#:11 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:1 step#:12 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:1 step#:13 step_rew:[0.0] action:(array([121, 205]),) dones:[False]\n","ep#:1 step#:14 step_rew:[0.0] action:(array([133, 210]),) dones:[False]\n","ep#:1 step#:15 step_rew:[0.0] action:(array([160, 206]),) dones:[False]\n","ep#:1 step#:16 step_rew:[0.0] action:(array([ 30, 210]),) dones:[False]\n","ep#:1 step#:17 step_rew:[0.0] action:(array([129, 210]),) dones:[False]\n","ep#:1 step#:18 step_rew:[0.0] action:(array([160, 193]),) dones:[False]\n","ep#:1 step#:19 step_rew:[0.0] action:(array([109, 204]),) dones:[False]\n","ep#:1 step#:20 step_rew:[0.0] action:(array([ 0, 200]),) dones:[False]\n","ep#:1 step#:21 step_rew:[0.0] action:(array([104, 210]),) dones:[False]\n","ep#:1 step#:22 step_rew:[0.0] action:(array([150, 201]),) dones:[False]\n","ep#:1 
step#:23 step_rew:[0.0] action:(array([ 4, 210]),) dones:[False]\n","ep#:1 step#:24 step_rew:[0.0] action:(array([150, 203]),) dones:[False]\n","ep#:1 step#:25 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:1 step#:26 step_rew:[0.0] action:(array([124, 210]),) dones:[False]\n","ep#:1 step#:27 step_rew:[0.0] action:(array([114, 208]),) dones:[False]\n","ep#:1 step#:28 step_rew:[0.0] action:(array([110, 208]),) dones:[False]\n","ep#:1 step#:29 step_rew:[0.0] action:(array([160, 210]),) dones:[False]\n","ep#:1 step#:30 step_rew:[0.0] action:(array([142, 210]),) dones:[False]\n","ep#:1 step#:31 step_rew:[0.0] action:(array([160, 203]),) dones:[False]\n","ep#:1 step#:32 step_rew:[0.0] action:(array([112, 210]),) dones:[False]\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:root:Cannot call CoordClick(coords: (160, 202)) on instance 0, which is already done\n"]},{"output_type":"stream","name":"stdout","text":["ep#:1 step#:33 step_rew:[-1.0] action:(array([160, 202]),) dones:[True]\n","Episode#1 Reward:-1.0 Actions:[]\n","WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n"]},{"output_type":"stream","name":"stdout","text":["INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/actor/1/model.savedmodel/assets\n"]},{"output_type":"stream","name":"stderr","text":["INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/actor/1/model.savedmodel/assets\n"]},{"output_type":"stream","name":"stdout","text":["Actor model saved at:trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/actor/1/model.savedmodel\n","WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"]},{"output_type":"stream","name":"stdout","text":["INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/critic/1/model.savedmodel/assets\n"]},{"output_type":"stream","name":"stderr","text":["INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/critic/1/model.savedmodel/assets\n"]},{"output_type":"stream","name":"stdout","text":["Critic model saved at:trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/critic/1/model.savedmodel\n"]}]},{"cell_type":"code","metadata":{"id":"Z2Ln02kN-kyR"},"source":["%tensorboard --logdir /content/logs/TFRL-SocialMedia-Like-Reply-Agent/MiniWoBSocialMediaReplyVisualEnv-v0"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"REy4PvyI-xjo"},"source":["![image.png]()"]},{"cell_type":"markdown","metadata":{"id":"LbSHkK-191Rz"},"source":["## Social Media Mute 
Agent"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"IVfl2f449c_M","executionInfo":{"status":"ok","timestamp":1638512604578,"user_tz":-330,"elapsed":464,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"871f329d-2829-4675-8d99-7968a82f7c2d"},"source":["parser = argparse.ArgumentParser(prog=\"TFRL-SocialMedia-Mute-User-Agent\")\n","parser.add_argument(\"--env\", default=\"MiniWoBSocialMediaMuteUserVisualEnv-v0\")\n","parser.add_argument(\"--update-freq\", type=int, default=16)\n","parser.add_argument(\"--epochs\", type=int, default=3)\n","parser.add_argument(\"--actor-lr\", type=float, default=1e-4)\n","parser.add_argument(\"--critic-lr\", type=float, default=1e-4)\n","parser.add_argument(\"--clip-ratio\", type=float, default=0.1)\n","parser.add_argument(\"--gae-lambda\", type=float, default=0.95)\n","parser.add_argument(\"--gamma\", type=float, default=0.99)\n","parser.add_argument(\"--logdir\", default=\"logs\")\n","\n","args = parser.parse_args([])\n","logdir = os.path.join(\n"," args.logdir, parser.prog, args.env, datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",")\n","print(f\"Saving training logs to:{logdir}\")\n","writer = tf.summary.create_file_writer(logdir)"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Saving training logs to:logs/TFRL-SocialMedia-Mute-User-Agent/MiniWoBSocialMediaMuteUserVisualEnv-v0/20211203-062327\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UJEVcH6F9c8e","executionInfo":{"status":"ok","timestamp":1638512681452,"user_tz":-330,"elapsed":41629,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"484098c3-4f51-498f-b730-56e2cf7d8dd8"},"source":["class Actor:\n"," def __init__(self, state_dim, action_dim, action_bound, 
std_bound):\n"," self.state_dim = state_dim\n"," self.action_dim = action_dim\n"," self.action_bound = np.array(action_bound)\n"," self.std_bound = std_bound\n"," self.weight_initializer = tf.keras.initializers.he_normal()\n"," self.eps = 1e-5\n"," self.model = self.nn_model()\n"," self.model.summary() # Print a summary of the Actor model\n"," self.opt = tf.keras.optimizers.Nadam(args.actor_lr)\n","\n"," def nn_model(self):\n"," obs_input = Input(self.state_dim, name=\"im_obs\")\n"," conv1 = Conv2D(\n"," filters=64,\n"," kernel_size=(3, 3),\n"," strides=(1, 1),\n"," padding=\"same\",\n"," input_shape=self.state_dim,\n"," data_format=\"channels_last\",\n"," activation=\"relu\",\n"," )(obs_input)\n"," pool1 = MaxPool2D(pool_size=(3, 3), strides=1)(conv1)\n"," conv2 = Conv2D(\n"," filters=32,\n"," kernel_size=(3, 3),\n"," strides=(1, 1),\n"," padding=\"valid\",\n"," activation=\"relu\",\n"," )(pool1)\n"," pool2 = MaxPool2D(pool_size=(3, 3), strides=1)(conv2)\n"," conv3 = Conv2D(\n"," filters=16,\n"," kernel_size=(3, 3),\n"," strides=(1, 1),\n"," padding=\"valid\",\n"," activation=\"relu\",\n"," )(pool2)\n"," pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)\n"," conv4 = Conv2D(\n"," filters=8,\n"," kernel_size=(3, 3),\n"," strides=(1, 1),\n"," padding=\"valid\",\n"," activation=\"relu\",\n"," )(pool3)\n"," pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)\n"," flat = Flatten()(pool4)\n"," dense1 = Dense(\n"," 16, activation=\"relu\", kernel_initializer=self.weight_initializer\n"," )(flat)\n"," dropout1 = Dropout(0.3)(dense1)\n"," dense2 = Dense(\n"," 8, activation=\"relu\", kernel_initializer=self.weight_initializer\n"," )(dropout1)\n"," dropout2 = Dropout(0.3)(dense2)\n"," # action_dim[0] = 2\n"," output_val = Dense(\n"," self.action_dim[0],\n"," activation=\"relu\",\n"," kernel_initializer=self.weight_initializer,\n"," )(dropout2)\n"," # Scale & clip x[i] to be in range [0, action_bound[i]]\n"," action_bound = copy.deepcopy(self.action_bound)\n"," mu_output 
= Lambda(\n"," lambda x: tf.clip_by_value(x * action_bound, 1e-9, action_bound),\n"," name=\"mu_output\",\n"," )(output_val)\n"," std_output_1 = Dense(\n"," self.action_dim[0],\n"," activation=\"softplus\",\n"," kernel_initializer=self.weight_initializer,\n"," )(dropout2)\n"," std_output = Lambda(\n"," lambda x: tf.clip_by_value(\n"," x * action_bound, 1e-9, action_bound / 2, name=\"std_output\"\n"," )\n"," )(std_output_1)\n"," return tf.keras.models.Model(\n"," inputs=obs_input, outputs=[mu_output, std_output], name=\"Actor\"\n"," )\n","\n"," def get_action(self, state):\n"," # Convert [Image] to np.array(np.adarray)\n"," state_np = np.array([np.array(s) for s in state])\n"," if len(state_np.shape) == 3:\n"," # Convert (w, h, c) to (1, w, h, c)\n"," state_np = np.expand_dims(state_np, 0)\n"," mu, std = self.model.predict(state_np)\n"," action = np.random.normal(mu[0], std[0] + self.eps, size=self.action_dim).astype(\n"," \"int\"\n"," )\n"," # Clip action to be between 0 and max obs screen size\n"," action = np.clip(action, 0, self.action_bound)\n"," # 1 Action per instance of env; Env expects: (num_instances, actions)\n"," action = (action,)\n"," log_policy = self.log_pdf(mu, std, action)\n"," return log_policy, action\n","\n"," def log_pdf(self, mu, std, action):\n"," std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])\n"," var = std ** 2\n"," log_policy_pdf = -0.5 * (action - mu) ** 2 / var - 0.5 * tf.math.log(\n"," var * 2 * np.pi\n"," )\n"," return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)\n","\n"," def compute_loss(self, log_old_policy, log_new_policy, actions, gaes):\n"," # Avoid INF in exp by setting 80 as the upper bound since,\n"," # tf.exp(x) for x>88 yeilds NaN (float32)\n"," ratio = tf.exp(\n"," tf.minimum(log_new_policy - tf.stop_gradient(log_old_policy), 80)\n"," )\n"," gaes = tf.stop_gradient(gaes)\n"," clipped_ratio = tf.clip_by_value(\n"," ratio, 1.0 - args.clip_ratio, 1.0 + args.clip_ratio\n"," )\n"," surrogate = 
class Critic:
    """State-value network for the PPO agent: image observation -> scalar value."""

    def __init__(self, state_dim):
        self.state_dim = state_dim
        self.weight_initializer = tf.keras.initializers.he_normal()
        self.model = self.nn_model()
        self.model.summary()  # Print a summary of the Critic model
        self.opt = tf.keras.optimizers.Nadam(args.critic_lr)

    def nn_model(self):
        """Assemble the conv/dense value network and return it as a Keras Model."""
        init = self.weight_initializer
        image_in = Input(self.state_dim)
        net = Conv2D(
            filters=64,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="same",
            input_shape=self.state_dim,
            data_format="channels_last",
            activation="relu",
        )(image_in)
        net = MaxPool2D(pool_size=(3, 3), strides=2)(net)
        # (conv filters, stride of the pooling layer that follows)
        for width, pool_stride in ((32, 2), (16, 1), (8, 1)):
            net = Conv2D(
                filters=width,
                kernel_size=(3, 3),
                strides=(1, 1),
                padding="valid",
                activation="relu",
            )(net)
            net = MaxPool2D(pool_size=(3, 3), strides=pool_stride)(net)
        net = Flatten()(net)
        for units in (16, 8):
            net = Dense(units, activation="relu", kernel_initializer=init)(net)
            net = Dropout(0.3)(net)
        state_value = Dense(1, activation="linear", kernel_initializer=init)(net)

        return tf.keras.models.Model(inputs=image_in, outputs=state_value, name="Critic")

    def compute_loss(self, v_pred, td_targets):
        """Mean squared error between predicted values and TD targets."""
        return tf.keras.losses.MeanSquaredError()(td_targets, v_pred)

    def train(self, states, td_targets):
        """Apply one gradient step towards ``td_targets``; returns the loss."""
        with tf.GradientTape() as tape:
            predicted = self.model(states, training=True)
            # assert predicted.shape == td_targets.shape
            loss = self.compute_loss(predicted, tf.stop_gradient(td_targets))
        weights = self.model.trainable_variables
        self.opt.apply_gradients(zip(tape.gradient(loss, weights), weights))
        return loss

    def save(self, model_dir: str, version: int = 1):
        """Save the critic under ``<model_dir>/critic/<version>/model.savedmodel``."""
        target_dir = os.path.join(model_dir, "critic", str(version), "model.savedmodel")
        self.model.save(target_dir, save_format="tf")
        print(f"Critic model saved at:{target_dir}")
class PPOAgent:
    """PPO agent that clicks screen coordinates in a visual web environment.

    Owns an ``Actor`` (Gaussian policy over click coordinates) and a ``Critic``
    (state-value estimate) and trains both from on-policy rollouts.
    """

    def __init__(self, env):
        self.env = env
        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.shape
        # Set action_bounds to be within the actual task-window/browser-view of the agent
        self.action_bound = [self.env.task_width, self.env.task_height]
        self.std_bound = [1e-2, 1.0]

        self.actor = Actor(
            self.state_dim, self.action_dim, self.action_bound, self.std_bound
        )
        self.critic = Critic(self.state_dim)

    def gae_target(
        self, rewards, v_values, next_v_value, done, gamma=None, gae_lambda=None
    ):
        """Compute GAE advantages and the matching value targets for a rollout.

        Args:
            rewards: per-step rewards, N values.
            v_values: critic predictions for the N visited states, (N,) or (N, 1).
            next_v_value: critic prediction for the state after the rollout;
                only read when ``done`` is false.
            done: whether the rollout ended the episode (no bootstrap if true).
            gamma: discount factor; defaults to ``args.gamma``.
            gae_lambda: GAE smoothing factor; defaults to ``args.gae_lambda``.

        Returns:
            (gae, n_step_targets): two float arrays of shape (N,).
        """
        gamma = args.gamma if gamma is None else gamma
        gae_lambda = args.gae_lambda if gae_lambda is None else gae_lambda

        # Work on flat float vectors so every step below is scalar arithmetic;
        # assigning size-1 arrays into scalar slots is deprecated in NumPy.
        rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
        values = np.asarray(v_values, dtype=np.float64).reshape(-1)
        n_step_targets = np.zeros_like(rewards)
        gae = np.zeros_like(rewards)
        gae_cumulative = 0.0
        forward_val = 0.0
        if not done:
            forward_val = float(np.asarray(next_v_value, dtype=np.float64).reshape(-1)[0])

        for k in reversed(range(len(rewards))):
            delta = rewards[k] + gamma * forward_val - values[k]
            gae_cumulative = gamma * gae_lambda * gae_cumulative + delta
            gae[k] = gae_cumulative
            forward_val = values[k]
            n_step_targets[k] = gae[k] + values[k]
        return gae, n_step_targets

    def train(self, max_episodes=1000):
        """Collect rollouts and update actor/critic for ``max_episodes`` episodes.

        An update runs whenever ``args.update_freq`` transitions have been
        collected or the episode ends; each update performs ``args.epochs``
        passes. Losses and episode reward are written to the module-level
        summary ``writer``.
        """
        with writer.as_default():
            for ep in range(max_episodes):
                state_batch = []
                action_batch = []
                reward_batch = []
                old_policy_batch = []
                # Kept separately from action_batch, which is emptied after
                # every update and is therefore always empty at episode end.
                episode_actions = []

                episode_reward, done = 0, False

                state = self.env.reset()
                prev_state = state
                step_num = 0

                while not done:
                    # self.env.render()
                    log_old_policy, action = self.actor.get_action(state)

                    next_state, reward, dones, _ = self.env.step(action)
                    step_num += 1
                    print(
                        f"ep#:{ep} step#:{step_num} step_rew:{reward} action:{action} dones:{dones}"
                    )
                    done = np.all(dones)
                    if done:
                        # Reuse the last observation seen before termination
                        # instead of the one returned by the terminal step.
                        next_state = prev_state
                    else:
                        prev_state = next_state
                    state = np.array([np.array(s) for s in state])
                    next_state = np.array([np.array(s) for s in next_state])
                    reward = np.reshape(reward, [1, 1])
                    log_old_policy = np.reshape(log_old_policy, [1, 1])

                    state_batch.append(state)
                    action_batch.append(action)
                    episode_actions.append(action)
                    # NOTE(review): the (r + 8) / 8 rescaling is a fixed magic
                    # constant; confirm it suits this env's reward range.
                    reward_batch.append((reward + 8) / 8)
                    old_policy_batch.append(log_old_policy)

                    if len(state_batch) >= args.update_freq or done:
                        states = np.array([s.squeeze() for s in state_batch])
                        # Convert ([x, y],) to [x, y]
                        actions = np.array([a[0] for a in action_batch])
                        rewards = np.array([r.squeeze() for r in reward_batch])
                        # Column vectors: the actor's log-probabilities and the
                        # critic's predictions are (N, 1); passing (N,) arrays
                        # would broadcast the losses to (N, N).
                        old_policies = np.array(
                            [p.squeeze() for p in old_policy_batch]
                        ).reshape(-1, 1)

                        v_values = self.critic.model.predict(states)
                        next_v_value = self.critic.model.predict(next_state)

                        gaes, td_targets = self.gae_target(
                            rewards, v_values, next_v_value, done
                        )
                        gaes = np.reshape(gaes, (-1, 1))
                        td_targets = np.reshape(td_targets, (-1, 1))

                        actor_losses, critic_losses = [], []
                        for epoch in range(args.epochs):
                            actor_loss = self.actor.train(
                                old_policies, states, actions, gaes
                            )
                            actor_losses.append(actor_loss)
                            critic_loss = self.critic.train(states, td_targets)
                            critic_losses.append(critic_loss)
                        # Plot mean actor & critic losses on every update
                        tf.summary.scalar("actor_loss", np.mean(actor_losses), step=ep)
                        tf.summary.scalar(
                            "critic_loss", np.mean(critic_losses), step=ep
                        )

                        state_batch = []
                        action_batch = []
                        reward_batch = []
                        old_policy_batch = []

                    episode_reward += reward[0][0]
                    state = next_state[0]

                print(f"Episode#{ep} Reward:{episode_reward} Actions:{episode_actions}")
                tf.summary.scalar("episode_reward", episode_reward, step=ep)

    def save(self, model_dir: str, version: int = 1):
        """Save actor and critic below ``model_dir`` (see their ``save`` methods)."""
        self.actor.save(model_dir, version)
        self.critic.save(model_dir, version)
if __name__ == "__main__":
    # Train a PPO agent on the environment selected by --env, then export both
    # networks as SavedModels under trained_models/PPO_<env>/.
    env_name = args.env
    env = gym.make(env_name)
    try:
        cta_agent = PPOAgent(env)
        cta_agent.train(max_episodes=2)
    finally:
        # Always release the environment, even if training raises.
        # NOTE(review): presumably this shuts down the Selenium/Chromium
        # session behind the webgym env; confirm against webgym.
        env.close()
    # Model saving
    model_dir = "trained_models"
    agent_name = f"PPO_{env_name}"
    agent_version = 1
    agent_model_path = os.path.join(model_dir, agent_name)
    cta_agent.save(agent_model_path, agent_version)
\n","==================================================================================================\n"," im_obs (InputLayer) [(None, 64, 64, 3)] 0 [] \n"," \n"," conv2d_20 (Conv2D) (None, 64, 64, 64) 1792 ['im_obs[0][0]'] \n"," \n"," max_pooling2d_20 (MaxPooling2D (None, 62, 62, 64) 0 ['conv2d_20[0][0]'] \n"," ) \n"," \n"," conv2d_21 (Conv2D) (None, 60, 60, 32) 18464 ['max_pooling2d_20[0][0]'] \n"," \n"," max_pooling2d_21 (MaxPooling2D (None, 58, 58, 32) 0 ['conv2d_21[0][0]'] \n"," ) \n"," \n"," conv2d_22 (Conv2D) (None, 56, 56, 16) 4624 ['max_pooling2d_21[0][0]'] \n"," \n"," max_pooling2d_22 (MaxPooling2D (None, 54, 54, 16) 0 ['conv2d_22[0][0]'] \n"," ) \n"," \n"," conv2d_23 (Conv2D) (None, 52, 52, 8) 1160 ['max_pooling2d_22[0][0]'] \n"," \n"," max_pooling2d_23 (MaxPooling2D (None, 50, 50, 8) 0 ['conv2d_23[0][0]'] \n"," ) \n"," \n"," flatten_5 (Flatten) (None, 20000) 0 ['max_pooling2d_23[0][0]'] \n"," \n"," dense_17 (Dense) (None, 16) 320016 ['flatten_5[0][0]'] \n"," \n"," dropout_10 (Dropout) (None, 16) 0 ['dense_17[0][0]'] \n"," \n"," dense_18 (Dense) (None, 8) 136 ['dropout_10[0][0]'] \n"," \n"," dropout_11 (Dropout) (None, 8) 0 ['dense_18[0][0]'] \n"," \n"," dense_19 (Dense) (None, 2) 18 ['dropout_11[0][0]'] \n"," \n"," dense_20 (Dense) (None, 2) 18 ['dropout_11[0][0]'] \n"," \n"," mu_output (Lambda) (None, 2) 0 ['dense_19[0][0]'] \n"," \n"," lambda_2 (Lambda) (None, 2) 0 ['dense_20[0][0]'] \n"," \n","==================================================================================================\n","Total params: 346,228\n","Trainable params: 346,228\n","Non-trainable params: 0\n","__________________________________________________________________________________________________\n","Model: \"Critic\"\n","_________________________________________________________________\n"," Layer (type) Output Shape Param # \n","=================================================================\n"," input_3 (InputLayer) [(None, 64, 64, 3)] 0 \n"," \n"," conv2d_24 
(Conv2D) (None, 64, 64, 64) 1792 \n"," \n"," max_pooling2d_24 (MaxPoolin (None, 31, 31, 64) 0 \n"," g2D) \n"," \n"," conv2d_25 (Conv2D) (None, 29, 29, 32) 18464 \n"," \n"," max_pooling2d_25 (MaxPoolin (None, 14, 14, 32) 0 \n"," g2D) \n"," \n"," conv2d_26 (Conv2D) (None, 12, 12, 16) 4624 \n"," \n"," max_pooling2d_26 (MaxPoolin (None, 10, 10, 16) 0 \n"," g2D) \n"," \n"," conv2d_27 (Conv2D) (None, 8, 8, 8) 1160 \n"," \n"," max_pooling2d_27 (MaxPoolin (None, 6, 6, 8) 0 \n"," g2D) \n"," \n"," flatten_6 (Flatten) (None, 288) 0 \n"," \n"," dense_21 (Dense) (None, 16) 4624 \n"," \n"," dropout_12 (Dropout) (None, 16) 0 \n"," \n"," dense_22 (Dense) (None, 8) 136 \n"," \n"," dropout_13 (Dropout) (None, 8) 0 \n"," \n"," dense_23 (Dense) (None, 1) 9 \n"," \n","=================================================================\n","Total params: 30,809\n","Trainable params: 30,809\n","Non-trainable params: 0\n","_________________________________________________________________\n","ep#:0 step#:1 step_rew:[0.0] action:(array([ 93, 111]),) dones:[False]\n","ep#:0 step#:2 step_rew:[0.0] action:(array([ 0, 152]),) dones:[False]\n","ep#:0 step#:3 step_rew:[0.0] action:(array([91, 35]),) dones:[False]\n","ep#:0 step#:4 step_rew:[0.0] action:(array([20, 0]),) dones:[False]\n","ep#:0 step#:5 step_rew:[0.0] action:(array([19, 0]),) dones:[False]\n","ep#:0 step#:6 step_rew:[0.0] action:(array([60, 45]),) dones:[False]\n","ep#:0 step#:7 step_rew:[0.0] action:(array([ 0, 80]),) dones:[False]\n","ep#:0 step#:8 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:0 step#:9 step_rew:[0.0] action:(array([143, 0]),) dones:[False]\n","ep#:0 step#:10 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:0 step#:11 step_rew:[0.0] action:(array([ 0, 142]),) dones:[False]\n","ep#:0 step#:12 step_rew:[0.0] action:(array([ 93, 115]),) dones:[False]\n","ep#:0 step#:13 step_rew:[0.0] action:(array([ 0, 105]),) dones:[False]\n","ep#:0 step#:14 step_rew:[0.0] action:(array([0, 0]),) 
dones:[False]\n","ep#:0 step#:15 step_rew:[0.0] action:(array([ 26, 113]),) dones:[False]\n","ep#:0 step#:16 step_rew:[0.0] action:(array([97, 0]),) dones:[False]\n","ep#:0 step#:17 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:0 step#:18 step_rew:[0.0] action:(array([160, 0]),) dones:[False]\n","ep#:0 step#:19 step_rew:[0.0] action:(array([110, 51]),) dones:[False]\n","ep#:0 step#:20 step_rew:[0.0] action:(array([84, 0]),) dones:[False]\n","ep#:0 step#:21 step_rew:[0.0] action:(array([50, 0]),) dones:[False]\n","ep#:0 step#:22 step_rew:[0.0] action:(array([ 0, 15]),) dones:[False]\n","ep#:0 step#:23 step_rew:[0.0] action:(array([54, 0]),) dones:[False]\n","ep#:0 step#:24 step_rew:[0.0] action:(array([ 0, 73]),) dones:[False]\n","ep#:0 step#:25 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:root:Cannot call CoordClick(coords: (0, 14)) on instance 0, which is already done\n"]},{"output_type":"stream","name":"stdout","text":["ep#:0 step#:26 step_rew:[0.0] action:(array([80, 81]),) dones:[False]\n","ep#:0 step#:27 step_rew:[-1.0] action:(array([ 0, 14]),) dones:[True]\n","Episode#0 Reward:-1.0 Actions:[]\n","ep#:1 step#:1 step_rew:[0.0] action:(array([ 0, 48]),) dones:[False]\n","ep#:1 step#:2 step_rew:[0.0] action:(array([60, 83]),) dones:[False]\n","ep#:1 step#:3 step_rew:[0.0] action:(array([95, 0]),) dones:[False]\n","ep#:1 step#:4 step_rew:[0.0] action:(array([ 0, 184]),) dones:[False]\n","ep#:1 step#:5 step_rew:[0.0] action:(array([0, 4]),) dones:[False]\n","ep#:1 step#:6 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:7 step_rew:[0.0] action:(array([160, 0]),) dones:[False]\n","ep#:1 step#:8 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:9 step_rew:[0.0] action:(array([29, 43]),) dones:[False]\n","ep#:1 step#:10 step_rew:[0.0] action:(array([ 0, 81]),) dones:[False]\n","ep#:1 step#:11 step_rew:[0.0] action:(array([65, 0]),) 
dones:[False]\n","ep#:1 step#:12 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:13 step_rew:[0.0] action:(array([ 0, 210]),) dones:[False]\n","ep#:1 step#:14 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:15 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:16 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:17 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:18 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:19 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:20 step_rew:[0.0] action:(array([0, 0]),) dones:[False]\n","ep#:1 step#:21 step_rew:[0.0] action:(array([ 0, 50]),) dones:[False]\n","ep#:1 step#:22 step_rew:[0.0] action:(array([ 0, 80]),) dones:[False]\n","ep#:1 step#:23 step_rew:[0.0] action:(array([31, 0]),) dones:[False]\n","ep#:1 step#:24 step_rew:[0.0] action:(array([ 84, 161]),) dones:[False]\n","ep#:1 step#:25 step_rew:[0.0] action:(array([ 0, 28]),) dones:[False]\n","ep#:1 step#:26 step_rew:[0.0] action:(array([1, 0]),) dones:[False]\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:root:Cannot call CoordClick(coords: (0, 140)) on instance 0, which is already done\n"]},{"output_type":"stream","name":"stdout","text":["ep#:1 step#:27 step_rew:[0.0] action:(array([ 0, 172]),) dones:[False]\n","ep#:1 step#:28 step_rew:[-1.0] action:(array([ 0, 140]),) dones:[True]\n","Episode#1 Reward:-1.0 Actions:[]\n","WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. 
`model.compile_metrics` will be empty until you train or evaluate the model.\n"]},{"output_type":"stream","name":"stdout","text":["INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/actor/1/model.savedmodel/assets\n"]},{"output_type":"stream","name":"stderr","text":["INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/actor/1/model.savedmodel/assets\n"]},{"output_type":"stream","name":"stdout","text":["Actor model saved at:trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/actor/1/model.savedmodel\n","WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"]},{"output_type":"stream","name":"stdout","text":["INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/critic/1/model.savedmodel/assets\n"]},{"output_type":"stream","name":"stderr","text":["INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/critic/1/model.savedmodel/assets\n"]},{"output_type":"stream","name":"stdout","text":["Critic model saved at:trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/critic/1/model.savedmodel\n"]}]},{"cell_type":"code","metadata":{"id":"D_2THOMX-2E7"},"source":["%tensorboard --logdir /content/logs/TFRL-SocialMedia-Mute-User-Agent/MiniWoBSocialMediaMuteUserVisualEnv-v0"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"tNF2n2F5_Hsy"},"source":["![image.png]()"]},{"cell_type":"markdown","metadata":{"id":"VlOVmEIW95ae"},"source":["## Social Media Mute Agent 
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity=10000):
        # deque(maxlen=...) drops the oldest transition once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def store(self, state, action, reward, next_state, done):
        """Append one transition."""
        self.buffer.append([state, action, reward, next_state, done])

    def sample(self, batch_size=None):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Args:
            batch_size: number of transitions; defaults to ``args.batch_size``.

        Returns:
            (states, actions, rewards, next_states, done) as numpy arrays with
            the batch on axis 0. ``states`` and ``next_states`` are flattened
            to ``(batch_size, -1)``.

        Raises:
            ValueError: from ``random.sample`` when fewer than ``batch_size``
                transitions are stored.
        """
        if batch_size is None:
            batch_size = args.batch_size
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, done = map(np.asarray, zip(*batch))
        states = states.reshape(batch_size, -1)
        next_states = next_states.reshape(batch_size, -1)
        return states, actions, rewards, next_states, done

    def size(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
class Actor:
    """Deterministic DDPG policy: image observation -> bounded action vector."""

    def __init__(self, state_dim, action_dim, action_bound):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.weight_initializer = tf.keras.initializers.he_normal()
        self.eps = 1e-5
        self.model = self.nn_model()
        self.opt = tf.keras.optimizers.Adam(args.actor_lr)

    def nn_model(self):
        """Assemble the conv/dense policy network and return it as a Keras Model."""
        init = self.weight_initializer
        image_in = Input(self.state_dim)
        net = Conv2D(
            filters=64,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="same",
            input_shape=self.state_dim,
            data_format="channels_last",
            activation="relu",
        )(image_in)
        net = MaxPool2D(pool_size=(3, 3), strides=1)(net)
        # The remaining conv stages differ only in their filter count.
        for width in (32, 16, 8):
            net = Conv2D(
                filters=width,
                kernel_size=(3, 3),
                strides=(1, 1),
                padding="valid",
                activation="relu",
            )(net)
            net = MaxPool2D(pool_size=(3, 3), strides=1)(net)
        net = Flatten()(net)
        for units in (16, 8):
            net = Dense(units, activation="relu", kernel_initializer=init)(net)
            net = Dropout(0.3)(net)
        # One output unit per action dimension (action_dim[0]).
        raw_action = Dense(
            self.action_dim[0], activation="relu", kernel_initializer=init
        )(net)
        # Scale by the bounds, then clip into [1e-9, action_bound[i]].
        bounded_action = Lambda(
            lambda x: tf.clip_by_value(x * self.action_bound, 1e-9, self.action_bound)
        )(raw_action)
        return tf.keras.models.Model(
            inputs=image_in, outputs=bounded_action, name="Actor"
        )

    def train(self, states, q_grads):
        """Apply one policy-gradient step, using ``-q_grads`` as output gradients."""
        weights = self.model.trainable_variables
        with tf.GradientTape() as tape:
            policy_out = self.model(states)
            weight_grads = tape.gradient(policy_out, weights, -q_grads)
        self.opt.apply_gradients(zip(weight_grads, weights))

    def predict(self, state):
        """Forward ``state`` through the Keras model's ``predict``."""
        return self.model.predict(state)

    def get_action(self, state):
        """Return the policy output for ``state``, clipped to [0, action_bound]."""
        # Materialise the observation(s) as one numpy array.
        batch = np.array(list(map(np.array, state)))
        if batch.ndim == 3:
            # A single (w, h, c) observation: add the batch axis.
            batch = batch[np.newaxis, ...]
        return np.clip(self.model.predict(batch), 0, self.action_bound)
class Critic:
    """DDPG action-value network Q(s, a).

    Takes two inputs, the image observation and the action vector, because
    ``train``, ``q_gradients`` and the agent all call the model with
    ``[states, actions]``.
    """

    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.weight_initializer = tf.keras.initializers.he_normal()
        self.model = self.nn_model()
        self.opt = tf.keras.optimizers.Adam(args.critic_lr)

    def nn_model(self):
        """Build the two-input Q-network.

        The observation passes through four Conv2D/MaxPool2D stages; the
        flattened features are concatenated with the action vector before the
        dense layers.

        Returns:
            tf.keras.Model named "Critic" with inputs ``[observation, action]``
            and one linear output unit.
        """
        obs_input = Input(self.state_dim)
        # The action must be a model input; with an observation-only model the
        # [states, actions] calls below fail on the input count.
        action_input = Input(self.action_dim)
        features = Conv2D(
            filters=64,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="same",
            input_shape=self.state_dim,
            data_format="channels_last",
            activation="relu",
        )(obs_input)
        features = MaxPool2D(pool_size=(3, 3), strides=2)(features)
        # (conv filters, stride of the pooling layer that follows)
        for n_filters, pool_stride in ((32, 2), (16, 1), (8, 1)):
            features = Conv2D(
                filters=n_filters,
                kernel_size=(3, 3),
                strides=(1, 1),
                padding="valid",
                activation="relu",
            )(features)
            features = MaxPool2D(pool_size=(3, 3), strides=pool_stride)(features)
        flat = Flatten()(features)
        hidden = concatenate([flat, action_input])
        for units in (16, 8):
            hidden = Dense(
                units, activation="relu", kernel_initializer=self.weight_initializer
            )(hidden)
            hidden = Dropout(0.3)(hidden)
        value = Dense(
            1, activation="linear", kernel_initializer=self.weight_initializer
        )(hidden)

        return tf.keras.models.Model(
            inputs=[obs_input, action_input], outputs=value, name="Critic"
        )

    def predict(self, inputs):
        """Forward ``inputs`` (``[states, actions]``) through ``model.predict``."""
        return self.model.predict(inputs)

    def q_gradients(self, states, actions):
        """Return dQ/da for the given state/action batch."""
        actions = tf.convert_to_tensor(actions)
        with tf.GradientTape() as tape:
            # actions is a plain tensor, so it has to be watched explicitly.
            tape.watch(actions)
            q_values = self.model([states, actions])
            q_values = tf.squeeze(q_values)
        return tape.gradient(q_values, actions)

    def compute_loss(self, v_pred, td_targets):
        """Mean squared error between predicted Q-values and TD targets."""
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, actions, td_targets):
        """Apply one gradient step towards ``td_targets``; returns the loss.

        Raises:
            AssertionError: if the prediction and target shapes differ.
        """
        # Actions may arrive as integers (the agent casts them with
        # astype("int") before stepping the env); cast to the Keras float type
        # so they can be concatenated with the float features.
        actions = tf.cast(actions, tf.keras.backend.floatx())
        with tf.GradientTape() as tape:
            v_pred = self.model([states, actions], training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss
class DDPGAgent:
    """DDPG agent with target networks, a replay buffer and OU exploration noise."""

    def __init__(self, env):
        self.env = env
        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.shape
        self.action_bound = self.env.action_space.high

        self.buffer = ReplayBuffer()

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound)
        self.critic = Critic(self.state_dim, self.action_dim)

        self.target_actor = Actor(self.state_dim, self.action_dim, self.action_bound)
        self.target_critic = Critic(self.state_dim, self.action_dim)

        # Target networks start as exact copies of the online networks.
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic.model.set_weights(self.critic.model.get_weights())

    @staticmethod
    def _soft_update(online_model, target_model):
        """Move target weights towards online weights by a factor ``args.tau``."""
        blended = [
            args.tau * online_w + (1 - args.tau) * target_w
            for online_w, target_w in zip(
                online_model.get_weights(), target_model.get_weights()
            )
        ]
        target_model.set_weights(blended)

    def update_target(self):
        """Soft-update both target networks from their online counterparts."""
        self._soft_update(self.actor.model, self.target_actor.model)
        self._soft_update(self.critic.model, self.target_critic.model)

    def get_td_target(self, rewards, q_values, dones, gamma=None):
        """Compute one-step TD targets ``r + gamma * Q'(s', a')``.

        Terminal transitions use the reward alone.

        Args:
            rewards: per-sample rewards, B values.
            q_values: target-critic estimates for the next states, (B,) or (B, 1).
            dones: per-sample terminal flags, B values.
            gamma: discount factor; defaults to ``args.gamma``.

        Returns:
            A new float array shaped like ``q_values``; the input is not modified.
        """
        if gamma is None:
            gamma = args.gamma
        # np.array copies, so the caller's q_values are left untouched.
        next_q = np.array(q_values, dtype=np.float64)
        # Give rewards/dones trailing singleton axes so they line up with next_q
        # per sample instead of broadcasting across the batch.
        per_sample = (-1,) + (1,) * (next_q.ndim - 1)
        rewards = np.asarray(rewards, dtype=np.float64).reshape(per_sample)
        terminal = np.asarray(dones, dtype=bool).reshape(per_sample)
        return np.where(terminal, rewards, rewards + gamma * next_q)

    def add_ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        """Advance an Ornstein-Uhlenbeck noise state ``x`` by one step of size ``dt``."""
        return (
            x + rho * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)
        )

    def replay_experience(self):
        """Run 10 critic/actor updates on random minibatches from the buffer."""
        image_shape = (-1,) + tuple(self.state_dim)
        for _ in range(10):
            states, actions, rewards, next_states, dones = self.buffer.sample()
            # The buffer returns flattened observations; the conv networks
            # need the original image shape back.
            states = np.reshape(states, image_shape)
            next_states = np.reshape(next_states, image_shape)

            target_q_values = self.target_critic.predict(
                [next_states, self.target_actor.predict(next_states)]
            )
            td_targets = self.get_td_target(rewards, target_q_values, dones)

            self.critic.train(states, actions, td_targets)

            s_actions = self.actor.predict(states)
            s_grads = self.critic.q_gradients(states, s_actions)
            # action_dim is a shape tuple, so it must be unpacked into the
            # target shape rather than nested inside it.
            grads = np.array(s_grads).reshape((-1,) + tuple(self.action_dim))
            self.actor.train(states, grads)
            self.update_target()

    def train(self, max_episodes=1000):
        """Interact with the env for ``max_episodes`` episodes, learning from replay.

        Replay updates start once the buffer holds at least
        ``max(args.batch_size, args.train_start)`` transitions. Episode reward
        is written to the module-level summary ``writer``.
        """
        with writer.as_default():
            for ep in range(max_episodes):
                step_num, episode_reward, done = 0, 0, False

                state = self.env.reset()
                prev_state = state
                bg_noise = np.random.randint(
                    self.env.action_space.low,
                    self.env.action_space.high,
                    self.env.action_space.shape,
                )
                while not done:
                    # self.env.render()
                    action = self.actor.get_action(state)
                    noise = self.add_ou_noise(bg_noise, dim=self.action_dim)
                    action = np.clip(action + noise, 0, self.action_bound).astype("int")

                    next_state, reward, dones, _ = self.env.step(action)
                    done = np.all(dones)
                    if done:
                        # Reuse the last observation seen before termination
                        # instead of the one returned by the terminal step.
                        next_state = prev_state
                    else:
                        prev_state = next_state

                    # One transition per env instance. The stored state is the
                    # observation the action was chosen from, not next_state.
                    for (s, a, r, s_n, d) in zip(
                        state, action, reward, next_state, dones
                    ):
                        # NOTE(review): the (r + 8) / 8 rescaling is a fixed
                        # magic constant; confirm it suits this env's rewards.
                        self.buffer.store(s, a, (r + 8) / 8, s_n, d)
                        episode_reward += r

                    step_num += 1  # 1 across num_instances
                    print(
                        f"ep#:{ep} step#:{step_num} step_rew:{reward} action:{action} dones:{dones}"
                    )

                    bg_noise = noise
                    state = next_state
                    if (
                        self.buffer.size() >= args.batch_size
                        and self.buffer.size() >= args.train_start
                    ):
                        self.replay_experience()
                print(f"Episode#{ep} Reward:{episode_reward}")
                tf.summary.scalar("episode_reward", episode_reward, step=ep)
if __name__ == "__main__":
    # Train the DDPG agent on the environment selected by --env, so the env id
    # matches the one used for the TensorBoard log directory.
    env_name = args.env
    env = gym.make(env_name)
    try:
        agent = DDPGAgent(env)
        agent.train(max_episodes=2)
    finally:
        # Always release the environment, even if training raises.
        # NOTE(review): presumably this shuts down the Selenium/Chromium
        # session behind the webgym env; confirm against webgym.
        env.close()
step#:24 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:25 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:26 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:27 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:28 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:29 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:30 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:31 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:32 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:33 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:34 step_rew:[0.0] action:[[160 210]] dones:[False]\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:root:Cannot call CoordClick(coords: (160, 210)) on instance 0, which is already done\n"]},{"output_type":"stream","name":"stdout","text":["ep#:0 step#:35 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:0 step#:36 step_rew:[-1.0] action:[[160 210]] dones:[True]\n","Episode#0 Reward:-1.0\n","ep#:1 step#:1 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:2 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:3 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:4 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:5 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:6 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:7 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:8 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:9 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:10 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:11 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:12 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:13 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:14 step_rew:[0.0] action:[[160 210]] 
dones:[False]\n","ep#:1 step#:15 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:16 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:17 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:18 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:19 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:20 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:21 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:22 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:23 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:24 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:25 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:26 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:27 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:28 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:29 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:30 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:31 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:32 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:33 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:34 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:35 step_rew:[0.0] action:[[160 210]] dones:[False]\n"]},{"output_type":"stream","name":"stderr","text":["WARNING:root:Cannot call CoordClick(coords: (160, 210)) on instance 0, which is already done\n"]},{"output_type":"stream","name":"stdout","text":["ep#:1 step#:36 step_rew:[0.0] action:[[160 210]] dones:[False]\n","ep#:1 step#:37 step_rew:[-1.0] action:[[160 210]] dones:[True]\n","Episode#1 Reward:-1.0\n"]}]},{"cell_type":"code","metadata":{"id":"rx8UzI52dmn4"},"source":["%tensorboard --logdir 
/content/logs/TFRL-SocialMedia-Mute-User-DDPGAgent/MiniWoBSocialMediaMuteUserVisualEnv-v0"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"E2ITqp9G_hAN"},"source":["![image.png]()"]},{"cell_type":"markdown","metadata":{"id":"y_ucjqQ9tuZk"},"source":["---"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"bW0aBsS5tuZs","executionInfo":{"status":"ok","timestamp":1638513146554,"user_tz":-330,"elapsed":4127,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"80ec06f9-f3be-4809-bd59-1544c47260fe"},"source":["# Install watermark into the running kernel's environment (%pip, not !pip, so the\n","# package lands where the kernel can import it), then print the author, timestamp,\n","# machine info and the versions of imported packages plus selenium.\n","%pip install -q watermark\n","%reload_ext watermark\n","%watermark -a \"Sparsh A.\" -m -iv -u -t -d -p selenium"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Author: Sparsh A.\n","\n","Last updated: 2021-12-03 06:32:29\n","\n","selenium: 4.1.0\n","\n","Compiler : GCC 7.5.0\n","OS : Linux\n","Release : 5.4.104+\n","Machine : x86_64\n","Processor : x86_64\n","CPU cores : 2\n","Architecture: 64bit\n","\n","argparse : 1.1\n","sys : 3.7.12 (default, Sep 10 2021, 00:21:48) \n","[GCC 7.5.0]\n","gym : 0.17.3\n","tensorflow: 2.7.0\n","IPython : 5.5.0\n","numpy : 1.19.5\n","\n"]}]},{"cell_type":"markdown","metadata":{"id":"ilEAuT87tuZt"},"source":["---"]},{"cell_type":"markdown","metadata":{"id":"QXts9r8TtuZv"},"source":["**END**"]}]}