#######################################################################
# Copyright (C)                                                       #
# 2016 Shangtong Zhang(zhangshangtong.cpp@gmail.com)                  #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# 2017 Aja Rangaswamy (aja004@gmail.com)                              #
# 2019 Baochen Su (subaochen@126.com)                                 #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import seaborn as sns
from scipy.stats import poisson

matplotlib.use('Agg')

# maximum # of cars in each location
MAX_CARS = 20

# maximum # of cars to move during night
MAX_MOVE_OF_CARS = 5

# expectation for rental requests in first location
REQUEST_FIRST_LOC = 3

# expectation for rental requests in second location
REQUEST_SECOND_LOC = 4

# expectation for # of cars returned in first location
RETURNS_FIRST_LOC = 3

# expectation for # of cars returned in second location
RETURNS_SECOND_LOC = 2

DISCOUNT = 0.9

# credit earned by a car
RENTAL_CREDIT = 10

# cost of moving a car
MOVE_CAR_COST = 2

# all possible actions
actions = np.arange(-MAX_MOVE_OF_CARS, MAX_MOVE_OF_CARS + 1)

# An up bound for poisson distribution
# If n is greater than this value, then the probability of getting n is truncated to 0
# POISSON_UPPER_BOUND = 11

# Probability for poisson distribution
# @lam: lambda should be less than 10 for this function
poisson_cache = dict()


def poisson_probability(n, lam):
    global poisson_cache
    key = n * 10 + lam
    if key not in poisson_cache:
        poisson_cache[key] = poisson.pmf(n, lam)
    return poisson_cache[key]


def expected_return(state, action, state_value, constant_returned_cars, POISSON_UPPER_BOUND):
    """
    @state: [# of cars in first location, # of cars in second location]
    @action: positive if moving cars from first location to second location,
            negative if moving cars from second location to first location
    @stateValue: state value matrix
    @constant_returned_cars:  if set True, model is simplified such that
    the # of cars returned in daytime becomes constant
    rather than a random value from poisson distribution, which will reduce calculation time
    and leave the optimal policy/value state matrix almost the same
    """
    # initialize total return
    returns = 0.0

    # cost for moving cars
    returns -= MOVE_CAR_COST * abs(action)

    # moving cars
    init_num_of_cars_first_location = min(state[0] - action, MAX_CARS)
    init_num_of_cars_second_location = min(state[1] + action, MAX_CARS)

    # go through all possible rental requests
    for request_first_loc in range(POISSON_UPPER_BOUND):
        for request_second_loc in range(POISSON_UPPER_BOUND):
            # probability for current combination of rental requests
            prob = poisson_probability(request_first_loc, REQUEST_FIRST_LOC) * \
                poisson_probability(request_second_loc, REQUEST_SECOND_LOC)

            num_of_cars_first_loc = init_num_of_cars_first_location
            num_of_cars_second_loc = init_num_of_cars_second_location

            # valid rental requests should be less than actual # of cars
            valid_rental_first_loc = min(num_of_cars_first_loc, request_first_loc)
            valid_rental_second_loc = min(num_of_cars_second_loc, request_second_loc)

            # get credits for renting
            reward = (valid_rental_first_loc + valid_rental_second_loc) * RENTAL_CREDIT
            num_of_cars_first_loc -= valid_rental_first_loc
            num_of_cars_second_loc -= valid_rental_second_loc

            if constant_returned_cars:
                # get returned cars, those cars can be used for renting tomorrow
                returned_cars_first_loc = RETURNS_FIRST_LOC
                returned_cars_second_loc = RETURNS_SECOND_LOC
                num_of_cars_first_loc = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
                num_of_cars_second_loc = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
                returns += prob * (reward + DISCOUNT * state_value[num_of_cars_first_loc, num_of_cars_second_loc])
            else:
                for returned_cars_first_loc in range(POISSON_UPPER_BOUND):
                    for returned_cars_second_loc in range(POISSON_UPPER_BOUND):
                        prob_return = poisson_probability(returned_cars_first_loc, RETURNS_FIRST_LOC) *\
                            poisson_probability(returned_cars_second_loc, RETURNS_SECOND_LOC)
                        num_of_cars_first_loc_ = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
                        num_of_cars_second_loc_ = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
                        # prob是request（出租）某个数量的概率，prob_return是return（归还）某个数量的概率
                        # 两者的乘积既考虑了出租的影响，也考虑了归还的影响（对num_of_cars_of_first_loc的影响）
                        # 从数学的观点来看，这是一个条件概率问题（如何更清晰的数学表达？）
                        prob_ = prob_return * prob
                        returns += prob_ * (reward + DISCOUNT *
                                            state_value[num_of_cars_first_loc_, num_of_cars_second_loc_])
    return returns


def figure_4_2(POISSON_UPPER_BOUND, constant_returned_cars=True):
    value = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
    policy = np.zeros(value.shape, dtype=np.int)

    iterations = 0
    supfig, axes = plt.subplots(2, 3, figsize=(40, 20))
    supfig.suptitle(f'policy iteration[POISSON_UPPER_BOUND={POISSON_UPPER_BOUND}]', fontsize=30)
    plt.subplots_adjust(wspace=0.1, hspace=0.2)
    axes = axes.flatten()
    while True:
        fig = sns.heatmap(np.flipud(policy), cmap="YlGnBu", ax=axes[iterations])
        fig.set_ylabel('# cars at first location', fontsize=30)
        fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
        fig.set_xlabel('# cars at second location', fontsize=30)
        fig.set_title('$\pi_{}$'.format(iterations), fontsize=30)

        # policy evaluation (in-place)
        while True:
            old_value = value.copy()
            for i in range(MAX_CARS + 1):
                for j in range(MAX_CARS + 1):
                    new_state_value = expected_return([i, j], policy[i, j],
                        value, constant_returned_cars, POISSON_UPPER_BOUND)
                    value[i, j] = new_state_value
            max_value_change = abs(old_value - value).max()
            print('max value change {}'.format(max_value_change))
            if max_value_change < 1e-4:
                break

        # policy improvement
        print(f'policy improvement on policy:{policy}')
        policy_stable = True
        for i in range(MAX_CARS + 1):
            for j in range(MAX_CARS + 1):
                old_action = policy[i, j]
                action_returns = []
                for action in actions:
                    if (0 <= action <= i) or (-j <= action <= 0):
                        action_returns.append(expected_return([i, j], action,
                            value, constant_returned_cars, POISSON_UPPER_BOUND))
                    else:
                        action_returns.append(-np.inf)
                # 这里是关键所在：寻找使得value function最大化的action
                new_action = actions[np.argmax(action_returns)]
                policy[i, j] = new_action
                if policy_stable and old_action != new_action:
                    policy_stable = False
        print('policy stable {}'.format(policy_stable))

        if policy_stable:
            fig = sns.heatmap(np.flipud(value), cmap="YlGnBu", ax=axes[-1])
            fig.set_ylabel('# cars at first location', fontsize=30)
            fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
            fig.set_xlabel('# cars at second location', fontsize=30)
            fig.set_title('optimal value', fontsize=30)
            plt.savefig(f'../images/rl/dp/figure_4_2_{POISSON_UPPER_BOUND}.png')

            # 也画出三维曲面图
            plot_3d_value(value)
            break

        iterations += 1

    plt.close()


def plot_3d_value(value):
    fig_3d = plt.figure()  # 定义新的三维坐标轴
    ax3 = Axes3D(fig_3d)

    # 定义三维数据
    xx = np.arange(0, MAX_CARS, 1)
    yy = np.arange(MAX_CARS, 0, -1)
    X, Y = np.meshgrid(xx, yy)
    # python在这里表现的很智能！
    Z = value[X, Y]

    surf = ax3.plot_surface(X, Y, Z, cmap='rainbow', linewidth=0, antialiased=False)
    fig_3d.colorbar(surf, shrink=0.5, aspect=5)
    ax3.set_title('optimal value', fontsize=20)
    ax3.set_xlabel('# cars at second location', fontsize=10)
    ax3.set_ylabel('# cars at first location', fontsize=10)
    plt.savefig(f'../images/rl/dp/figure_4_2_3d_value_{POISSON_UPPER_BOUND}.png')


if __name__ == '__main__':
    for POISSON_UPPER_BOUND in np.arange(7, 20, 1):
        figure_4_2(POISSON_UPPER_BOUND)