# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
#print(os.listdir("../input/Data/Stocks"))

# Any results you write to the current directory are saved as output.

# NOTE(review): the notebook export is truncated at this point. Only the tail
# of one helper survived (its `def` line and the start of its loop header are
# missing), so it is preserved below as a comment instead of being guessed at.
# Indentation is inferred because the export collapsed it.
#
#     ... 101):
#         eps = batch[i:i+101]
#         episodes.append(eps)
#     data = np.stack(episodes)
#     assert len(data.shape) == 3
#     assert data.shape[-1] == 100
#     return data


class RandomTrader():
    """Baseline trader that ignores the market and emits random actions."""

    def get_action(self):
        """Return a random action vector of length 100.

        Values are first drawn uniformly from [-1, 1); each one is then
        multiplied by its own share of the total absolute value, so the
        result keeps the sign of the draw but is scaled down.
        """
        action = np.random.rand(100) * 2 - 1
        action = action * (np.abs(action) / np.sum(np.abs(action)))
        return action


import sys
#import gym
import numpy as np
from scipy.stats import norm
from keras.layers import Dense, Input, Lambda, LSTM
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
from collections import deque
import random

EPISODES = 3000


class A2CAgent:
    """Advantage actor-critic agent with a shared LSTM trunk.

    The actor head outputs two tensors per action dimension (used as the mean
    and the variance of a Gaussian in ``actor_optimizer``); the critic head
    outputs a single state value. Transitions are kept in a bounded replay
    deque and one stored transition is sampled per ``train_model`` call.
    """

    def __init__(self, state_size, state_seq_length, action_size):
        """Build the networks and their training functions.

        Args:
            state_size: number of features per time step of the input.
            state_seq_length: number of time steps in one input sequence.
            action_size: number of action dimensions produced by the actor.
        """
        self.render = False
        self.state_size = state_size
        self.state_seq_length = state_seq_length
        self.action_size = action_size
        self.value_size = 1

        # Replay memory; the oldest transitions are dropped past 2000 entries.
        self.exp_replay = deque(maxlen=2000)

        # Hyper-parameters.
        self.actor_lr = 0.0001
        self.critic_lr = 0.001
        self.discount_factor = .9

        # Actor and critic share the recurrent layers created in build_model.
        self.actor, self.critic = self.build_model()

        # Backend functions that apply one optimizer step each.
        self.optimize_actor = self.actor_optimizer()
        self.optimize_critic = self.critic_optimizer()

    def build_model(self):
        """Create the actor and critic models on top of a shared LSTM stack.

        Returns:
            ``(actor, critic)``: the actor maps a state sequence to
            ``(mu, sigma)``; the critic maps it to a scalar state value.
        """
        state = Input(batch_shape=(None, self.state_seq_length, self.state_size))

        x = LSTM(120, return_sequences=True)(state)
        x = LSTM(100)(x)

        actor_input = Dense(100, activation='relu', kernel_initializer='he_uniform')(x)
        mu = Dense(self.action_size, activation='tanh', kernel_initializer='he_uniform')(actor_input)
        sigma_0 = Dense(self.action_size, activation='softplus', kernel_initializer='he_uniform')(actor_input)
        # Small offset keeps the spread strictly positive (softplus can hit 0).
        sigma = Lambda(lambda x: x + 0.0001)(sigma_0)

        critic_input = Dense(30, activation='relu', kernel_initializer='he_uniform')(x)
        state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(critic_input)

        actor = Model(inputs=state, outputs=(mu, sigma))
        critic = Model(inputs=state, outputs=state_value)

        actor._make_predict_function()
        critic._make_predict_function()

        actor.summary()
        critic.summary()

        return actor, critic

    def actor_optimizer(self):
        """Return a backend function performing one actor update.

        The returned function takes ``[state, action, advantages]``.
        """
        # NOTE(review): both placeholders are declared with a last dimension
        # of 1, while train_model feeds arrays whose last dimension is
        # action_size -- confirm the intended shapes before training.
        action = K.placeholder(shape=(None, 1))
        advantages = K.placeholder(shape=(None, 1))

        # NOTE(review): the second actor output is named `sigma` in
        # build_model but is used here as a variance -- confirm intent.
        mu, sigma_sq = self.actor.output

        # Gaussian density of the taken action under the current policy.
        pdf = 1. / K.sqrt(2. * np.pi * sigma_sq) * K.exp(-K.square(action - mu) / (2. * sigma_sq))
        log_pdf = K.log(pdf + K.epsilon())
        entropy = K.sum(0.5 * (K.log(2. * np.pi * sigma_sq) + 1.))

        exp_v = log_pdf * advantages

        # NOTE(review): `entropy` is already a scalar sum and is added to
        # every element before the outer sum -- confirm this is intended.
        exp_v = K.sum(exp_v + 0.01 * entropy)
        actor_loss = -exp_v

        optimizer = Adam(lr=self.actor_lr)
        updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)

        train = K.function([self.actor.input, action, advantages], [], updates=updates)
        return train

    def critic_optimizer(self):
        """Return a backend function performing one critic update.

        The returned function takes ``[state, discounted_reward]`` and
        minimises the mean squared error against the critic output.
        """
        discounted_reward = K.placeholder(shape=(None, 1))

        value = self.critic.output

        loss = K.mean(K.square(discounted_reward - value))

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
        train = K.function([self.critic.input, discounted_reward], [], updates=updates)
        return train

    def get_action(self, state):
        """Sample an action from the policy for a single state sequence.

        Args:
            state: array reshapeable to ``(1, state_seq_length, state_size)``.

        Returns:
            ``mu + sqrt(sigma_sq) * noise`` with standard-normal noise,
            clipped to the range [-2, 2].
        """
        mu, sigma_sq = self.actor.predict(
            np.reshape(state, [1, self.state_seq_length, self.state_size]))
        epsilon = np.random.randn(self.action_size)
        action = mu + np.sqrt(sigma_sq) * epsilon
        action = np.clip(action, -2, 2)
        return action

    def train_model(self, state, action, reward, next_state, done):
        """Store a transition, then train on one randomly sampled transition.

        The given transition is appended to the replay memory; the update
        itself uses a transition drawn at random from that memory, which may
        or may not be the one just stored.
        """
        self.exp_replay.append((state, action, reward, next_state, done))

        (state, action, reward, next_state, done) = random.sample(self.exp_replay, 1)[0]

        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        value = self.critic.predict(state)[0]
        next_value = self.critic.predict(next_state)[0]

        if done:
            # Terminal transition: nothing to bootstrap from.
            advantages[0] = reward - value
            target[0][0] = reward
        else:
            advantages[0] = reward + self.discount_factor * (next_value) - value
            target[0][0] = reward + self.discount_factor * next_value

        self.optimize_actor([state, action, advantages])
        self.optimize_critic([state, target])
state_size = 100
state_seq_length = 100
action_size = 100

import time


def run_experiment(epochs=10):
    """Train an A2CAgent on TradeEnv and return the reward per epoch.

    Each epoch resets the environment and steps it until it reports `done`,
    training the agent after every step and printing wall-clock timings for
    each phase.

    NOTE(review): `TradeEnv` is not defined in the visible part of this file;
    it is presumably declared in a portion lost from the export.

    Args:
        epochs: number of episodes to run (defaults to the original 10).

    Returns:
        List with the summed reward of every epoch, in order.
    """
    start = time.time()
    env = TradeEnv()
    agent = A2CAgent(state_size, state_seq_length, action_size)
    reward_hist = []

    # NOTE(review): the original line was cut off after `time.time() - `;
    # `start` is restored by analogy with the other timing prints below.
    print('Setup: {:.4f}'.format(time.time() - start))

    for _ in range(epochs):

        start = time.time()
        state = env.reset()
        state = np.reshape(state, [1, state_seq_length, state_size])
        done = False
        total_reward = 0
        print('Game Start: {:.4f}'.format(time.time() - start))

        while not done:

            start = time.time()
            action = agent.get_action(state)
            print('Get Action: {:.4f}'.format(time.time() - start))

            start = time.time()
            next_state, reward, done, _ = env.step(action)
            print('Step: {:.4f}'.format(time.time() - start))

            start = time.time()
            next_state = np.reshape(next_state, [1, state_seq_length, state_size])
            agent.train_model(state, action, reward, next_state, done)
            print('Train: {:.4f}'.format(time.time() - start))

            total_reward += reward
            state = next_state

        print(total_reward)
        reward_hist.append(total_reward)
    return reward_hist

# Training takes a very long time to run, so the experiment below is left commented out.

#import matplotlib.pyplot as plt
#reward_hist = run_experiment()
#plt.plot(reward_hist)