import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


class Net(nn.Module):
    """Shared MLP backbone: Linear(state_num, 512) followed by `layer_num` hidden layers."""

    def __init__(self, layer_num, state_num=115, device='cpu', orthogonal=False):
        super().__init__()
        self.device = device
        self.model = [nn.Linear(state_num, 512).to(device),
                      nn.ReLU(inplace=True).to(device)]
        for i in range(layer_num):
            l = nn.Linear(512, 512).to(device)
            if orthogonal:
                # nn.init.orthogonal is deprecated; use the in-place variant
                nn.init.orthogonal_(l.weight)
            self.model += [l, nn.ReLU(inplace=True).to(device)]
        self.model = nn.Sequential(*self.model)

    def forward(self, s):
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s, device=self.device, dtype=torch.float)
        logits = self.model(s.to(self.device))
        return logits


class Actor(nn.Module):
    def __init__(self, layer_num, state_num=115, action_num=19, device='cpu', orthogonal=False):
        super().__init__()
        self.preprocess = Net(layer_num, state_num, device, orthogonal)
        self.last = nn.Linear(512, action_num).to(device)
        if orthogonal:
            nn.init.orthogonal_(self.last.weight)

    def forward(self, s, noise=None):
        logits = self.last(self.preprocess(s))
        return logits


class Critic(nn.Module):
    def __init__(self, layer_num, state_num=115, device='cpu', orthogonal=False):
        super().__init__()
        self.preprocess = Net(layer_num, state_num, device, orthogonal)
        self.last = nn.Linear(512, 1).to(device)
        if orthogonal:
            nn.init.orthogonal_(self.last.weight)

    def forward(self, s, a=None):
        if a is not None:
            if len(a.shape) == 1:
                a = a.unsqueeze(1)
            s = torch.cat([s, a], 1)
        logits = self.preprocess(s)
        logits = self.last(logits)
        return logits


class DDPG(object):
    def __init__(self, batch_size=32, state_num=115, action_num=19, device='cpu',
                 memory_capacity=100000, eps=0.1, gamma=0.9,
                 critic_lr=0.0001, actor_lr=0.0001, tau=0.005):
        self.actor = Actor(3, state_num, action_num, device)
        self.actor_target = Actor(3, state_num, action_num, device)
        self.critic = Critic(3, state_num + action_num, device)
        self.critic_target = Critic(3, state_num + action_num, device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.tau = tau
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.state_num = state_num
        self.action_num = action_num
        self.learn_step = 0
        self.memory_pts = 0
        self.eps = eps
        self.gamma = gamma
        # each row: [state (state_num), stored action distribution (action_num), reward (1), next state (state_num)]
        self.memory = np.zeros((memory_capacity, state_num * 2 + action_num + 1))
        self.loss = nn.MSELoss()
        self.device = device

    def choose_action(self, s):
        s = Variable(torch.unsqueeze(torch.FloatTensor(s), 0))
        logits = self.actor(s)
        logits = F.gumbel_softmax(logits, dim=-1)
        if np.random.uniform() > self.eps:
            action = torch.distributions.Categorical(probs=logits).sample()
            # action = torch.argmax(logits)
        else:
            action = torch.randint(0, self.action_num, ()).to(self.device)
        return action, logits.squeeze(0).detach().cpu().numpy()

    def store_transition(self, s, a, r, next_s):
        trans = np.hstack((s, a, r, next_s))
        i = self.memory_pts % self.memory_capacity
        self.memory[i, :] = trans
        self.memory_pts += 1

    def learn(self):
        self.learn_step += 1
        # only sample rows that have actually been written
        sample_index = np.random.choice(min(self.memory_pts, self.memory_capacity), self.batch_size)
        batch_memory = self.memory[sample_index, :]
        # memory layout: the first state_num columns hold the current state, the next
        # action_num columns the stored action distribution, then one reward column,
        # and the final state_num columns the next state
        batch_s = Variable(torch.FloatTensor(batch_memory[:, :self.state_num])).to(self.device)
        # the stored action is the continuous gumbel-softmax output, so keep it as float
        batch_a = Variable(
            torch.FloatTensor(batch_memory[:, self.state_num:self.state_num + self.action_num])).to(self.device)
        batch_r = Variable(torch.FloatTensor(
            batch_memory[:, self.state_num + self.action_num:self.state_num + self.action_num + 1])).to(self.device)
        batch_next_s = Variable(torch.FloatTensor(batch_memory[:, -self.state_num:])).to(self.device)

        batch_next_a_logits = self.actor_target(batch_next_s)
        batch_target_next_a = F.gumbel_softmax(batch_next_a_logits, dim=-1)
        y_true = batch_r + self.gamma * self.critic_target(batch_next_s, batch_target_next_a).detach()
        y_pred = self.critic(batch_s, batch_a)
        critic_loss = self.loss(y_pred, y_true)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        batch_a_logits = self.actor(batch_s)
        batch_target_a = F.gumbel_softmax(batch_a_logits, dim=-1)
        actor_loss = -torch.mean(self.critic(batch_s, batch_target_a))
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        # soft (Polyak) update of the target networks
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
        return critic_loss.item(), actor_loss.item()
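
# Minimal usage sketch for DDPG, with random 115-dim observations standing in for a
# real environment. The fake-transition loop below is illustrative only and is not
# part of the training pipeline; buffer sizes are shrunk so it finishes quickly.
def _ddpg_usage_example():
    agent = DDPG(batch_size=8, memory_capacity=64, device='cpu')
    s = np.random.randn(115).astype(np.float32)
    for _ in range(64):
        action, probs = agent.choose_action(s)            # probs is the gumbel-softmax output
        next_s = np.random.randn(115).astype(np.float32)  # placeholder for env.step(...)
        reward = float(np.random.randn())
        agent.store_transition(s, probs, reward, next_s)  # the probs vector is what learn() re-reads
        s = next_s
    critic_loss, actor_loss = agent.learn()
    return critic_loss, actor_loss
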
class TD3(DDPG):
    def __init__(self, batch_size=32, state_num=115, action_num=19, device='cpu',
                 memory_capacity=100000, update_freq=2, eps=0.1, gamma=0.9,
                 critic_lr=0.0001, actor_lr=0.0001, policy_noise=0.2, noise_clip=0.5, tau=0.005):
        super(TD3, self).__init__(batch_size, state_num, action_num, device, memory_capacity,
                                  eps, gamma, critic_lr, actor_lr, tau)
        self.critic_2 = Critic(3, state_num + action_num, device)
        self.critic_target_2 = Critic(3, state_num + action_num, device)
        self.critic_optim_2 = optim.Adam(self.critic_2.parameters(), lr=critic_lr)
        self.critic_target_2.load_state_dict(self.critic_2.state_dict())
        self.policy_noise = policy_noise
        self.update_freq = update_freq
        self.noise_clip = noise_clip
        self.a_loss = 0
        self.learn_step = 0
        self.device = device

    def learn(self):
        self.learn_step += 1
        # only sample rows that have actually been written
        sample_index = np.random.choice(min(self.memory_pts, self.memory_capacity), self.batch_size)
        batch_memory = self.memory[sample_index, :]
        # memory layout: state (state_num) | action distribution (action_num) | reward (1) | next state (state_num)
        batch_s = Variable(torch.FloatTensor(batch_memory[:, :self.state_num])).to(self.device)
        # the stored action is the continuous gumbel-softmax output, so keep it as float
        batch_a = Variable(
            torch.FloatTensor(batch_memory[:, self.state_num:self.state_num + self.action_num])).to(self.device)
        batch_r = Variable(torch.FloatTensor(
            batch_memory[:, self.state_num + self.action_num:self.state_num + self.action_num + 1])).to(self.device)
        batch_next_s = Variable(torch.FloatTensor(batch_memory[:, -self.state_num:])).to(self.device)

        # target policy smoothing: perturb the target action with clipped noise
        batch_next_a_logits = self.actor_target(batch_next_s)
        batch_target_next_a = F.gumbel_softmax(batch_next_a_logits, dim=-1)
        noise = torch.randn(size=batch_next_a_logits.shape, device=self.device) * self.policy_noise
        if self.noise_clip >= 0:
            noise = noise.clamp(-self.noise_clip, self.noise_clip)
        batch_target_next_a += noise

        # clipped double-Q target: take the minimum of the two target critics
        target_q = torch.min(self.critic_target(batch_next_s, batch_target_next_a),
                             self.critic_target_2(batch_next_s, batch_target_next_a))
        y_true = batch_r + self.gamma * target_q.detach()
        y_pred = self.critic(batch_s, batch_a)
        y_pred_2 = self.critic_2(batch_s, batch_a)

        critic_loss = self.loss(y_pred, y_true)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        critic_loss_2 = self.loss(y_pred_2, y_true)
        self.critic_optim_2.zero_grad()
        critic_loss_2.backward()
        self.critic_optim_2.step()

        # delayed policy update: refresh the actor and the target networks every update_freq steps
        if self.learn_step % self.update_freq == 0:
            batch_a_logits = self.actor(batch_s)
            batch_target_a = F.gumbel_softmax(batch_a_logits, dim=-1)
            actor_loss = -torch.mean(self.critic(batch_s, batch_target_a))
            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
            for target_param, param in zip(self.critic_target_2.parameters(), self.critic_2.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
            for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
            self.a_loss = actor_loss.item()
        return critic_loss.item(), critic_loss_2.item(), self.a_loss
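
# Standalone illustration of the target used in TD3.learn above,
#     y = r + gamma * min(Q1'(s', a'), Q2'(s', a')),
# written against plain tensors; it is a reference sketch and is not called by the class.
def _clipped_double_q_target(reward, q1_next, q2_next, gamma=0.9):
    return reward + gamma * torch.min(q1_next, q2_next)
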
class PPO(object):
    def __init__(self, batch_size=32, state_num=115, action_num=19, device='cpu',
                 memory_capacity=2048, eps=0.1, gamma=0.9, lr=0.0001, clip=0.2,
                 loss_coeff_value=0.5, loss_coeff_entropy=0.01, lamda=0.95,
                 clip_grad_norm=0.5, optim_epoch=10):
        self.actor = Actor(3, state_num, action_num, device, orthogonal=False)
        self.critic = Critic(3, state_num + action_num, device, orthogonal=False)
        self.optim = optim.Adam([{'params': self.actor.parameters()},
                                 {'params': self.critic.parameters()}], lr=lr)
        self.batch_size = batch_size
        self.state_num = state_num
        self.action_num = action_num
        self.memory_pts = 0
        self.eps = eps
        self.gamma = gamma
        self.clip = clip
        self.clip_grad_norm = clip_grad_norm
        self.loss_coeff_value = loss_coeff_value
        self.loss_coeff_entropy = loss_coeff_entropy
        self.lamda = lamda
        # each row: [state (state_num), action index (1), action probs (action_num),
        #            reward (1), continuation mask (1), old log-prob (1)]
        self.memory = np.zeros((memory_capacity, state_num + action_num + 4))
        self.memory_capacity = memory_capacity
        self.device = device
        self.loss = nn.MSELoss()
        self.optim_epoch = optim_epoch

    def choose_action(self, s):
        s = Variable(torch.unsqueeze(torch.FloatTensor(s), 0)).to(self.device)
        log_prob = -2.94  # roughly log(1/19), used as a stand-in log-prob for the random branch
        logits = self.actor(s)
        logits = F.gumbel_softmax(logits, dim=-1)
        # logits = F.softmax(logits, dim=-1)
        if np.random.uniform() > self.eps:
            dist = torch.distributions.Categorical(probs=logits)
            action = dist.sample()
            # action = torch.argmax(logits)
            log_prob = dist.log_prob(action).detach()
            # log_prob = torch.log(logits.squeeze(0)[action]).detach().cpu().numpy()
        else:
            action = torch.randint(0, self.action_num, ()).to(self.device)
        return action, log_prob, logits.squeeze(0).detach().cpu().numpy()

    def get_logproba(self, states, actions):
        logits = self.actor(states)
        logits = F.gumbel_softmax(logits, dim=-1)
        # logits = F.softmax(logits, dim=-1)
        dist = torch.distributions.Categorical(probs=logits)
        if len(actions.shape) == 2:
            # actions arrive as (batch, 1); squeeze to (batch,) before log_prob so the
            # result is not broadcast to (batch, batch), then restore the trailing dim
            logproba = dist.log_prob(actions.squeeze(-1)).unsqueeze(-1)
        else:
            logproba = dist.log_prob(actions).detach().cpu().numpy()
        return logproba

    def store_transition(self, s, a, logits, r, done, logprob):
        # `done` is later used as a continuation mask: pass 0 at episode boundaries, 1 otherwise
        trans = np.hstack((s, a, logits, r, done, logprob))
        i = self.memory_pts % self.memory_capacity
        self.memory[i, :] = trans
        self.memory_pts += 1

    def if_full(self):
        return self.memory_pts >= self.memory_capacity

    def reset_buffer(self):
        self.memory_pts = 0
        self.memory = np.zeros((self.memory_capacity, self.state_num + self.action_num + 4))
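
    # learn() below consumes the whole rollout buffer at once: it first walks the buffer
    # backwards to build discounted returns and GAE advantages,
    #     delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    #     A_t     = delta_t + gamma * lamda * mask_t * A_{t+1},
    # then runs optim_epoch passes of minibatch updates on the clipped surrogate
    #     L = -E[min(ratio_t * A_t, clip(ratio_t, 1 - clip, 1 + clip) * A_t)]
    # combined with the value loss and the entropy-style term weighted by
    # loss_coeff_value and loss_coeff_entropy.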
    def learn(self):
        batch_memory = self.memory
        total_size = batch_memory.shape[0]
        # memory layout: state (state_num) | action index (1) | action probs (action_num)
        # | reward (1) | continuation mask (1) | old log-prob (1)
        batch_s = Variable(torch.FloatTensor(batch_memory[:, :self.state_num])).to(self.device)
        batch_a = Variable(
            torch.LongTensor(batch_memory[:, self.state_num:self.state_num + 1])).to(self.device)
        # the stored probabilities are continuous, so load them as float directly
        batch_logits = Variable(
            torch.FloatTensor(batch_memory[:, self.state_num + 1:self.state_num + self.action_num + 1])).to(
            self.device)
        batch_r = Variable(torch.FloatTensor(
            batch_memory[:, self.state_num + self.action_num + 1:self.state_num + self.action_num + 2])).to(
            self.device)
        batch_v = self.critic(batch_s, batch_logits).detach()
        batch_mask = Variable(torch.FloatTensor(
            batch_memory[:, self.state_num + self.action_num + 2:self.state_num + self.action_num + 3])).to(
            self.device)
        batch_old_logprob = Variable(torch.FloatTensor(
            batch_memory[:, self.state_num + self.action_num + 3:self.state_num + self.action_num + 4])).to(
            self.device)

        # backward pass over the rollout: discounted returns, TD residuals and GAE advantages
        batch_returns = torch.Tensor(total_size).to(self.device)
        batch_delta = torch.Tensor(total_size).to(self.device)
        batch_advantage = torch.Tensor(total_size).to(self.device)
        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(total_size)):
            batch_returns[i] = batch_r[i] + self.gamma * prev_return * batch_mask[i]
            batch_delta[i] = batch_r[i] + self.gamma * prev_value * batch_mask[i] - batch_v[i]
            batch_advantage[i] = batch_delta[i] + self.gamma * self.lamda * prev_advantage * batch_mask[i]
            prev_return = batch_returns[i]
            prev_value = batch_v[i]
            prev_advantage = batch_advantage[i]
        batch_advantage = (batch_advantage - batch_advantage.mean()) / (batch_advantage.std() + 1e-10)

        return_loss = 0
        for _ in range(self.optim_epoch):
            for _ in range(int(total_size / self.batch_size)):
                # sample a minibatch from the current rollout
                minibatch_ind = np.random.choice(total_size, self.batch_size, replace=False)
                minibatch_states = batch_s[minibatch_ind]
                minibatch_actions = batch_a[minibatch_ind]
                minibatch_logits = batch_logits[minibatch_ind]
                minibatch_oldlogproba = batch_old_logprob[minibatch_ind]
                minibatch_newlogproba = self.get_logproba(minibatch_states, minibatch_actions)
                minibatch_advantage = batch_advantage[minibatch_ind]
                minibatch_returns = batch_returns[minibatch_ind]
                minibatch_newvalues = self.critic(minibatch_states, minibatch_logits)

                # clipped surrogate objective
                ratio = torch.exp(minibatch_newlogproba - minibatch_oldlogproba).squeeze(1)
                surr1 = ratio * minibatch_advantage
                surr2 = ratio.clamp(1 - self.clip, 1 + self.clip) * minibatch_advantage
                loss_surr = -torch.mean(torch.min(surr1, surr2))
                # squeeze the critic output to (batch,) so the difference is element-wise
                loss_value = torch.mean((minibatch_newvalues.squeeze(-1) - minibatch_returns).pow(2))
                loss_entropy = torch.mean(torch.exp(minibatch_newlogproba) * minibatch_newlogproba)
                total_loss = loss_surr + self.loss_coeff_value * loss_value + self.loss_coeff_entropy * loss_entropy

                self.optim.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad_norm)
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.clip_grad_norm)
                self.optim.step()
                return_loss += total_loss.item()
        return return_loss
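
# Minimal usage sketch for PPO, again with random observations standing in for a real
# environment; buffer sizes are shrunk so the example finishes quickly. Illustrative only.
def _ppo_usage_example():
    agent = PPO(batch_size=16, memory_capacity=64, device='cpu')
    s = np.random.randn(115).astype(np.float32)
    while not agent.if_full():
        action, log_prob, probs = agent.choose_action(s)
        next_s = np.random.randn(115).astype(np.float32)  # placeholder for env.step(...)
        reward = float(np.random.randn())
        mask = 1.0  # pass 0.0 at episode boundaries
        agent.store_transition(s, int(action), probs, reward, mask, float(log_prob))
        s = next_s
    total_loss = agent.learn()
    agent.reset_buffer()
    return total_loss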