####################################################################### # Copyright (C) # # 2016 Shangtong Zhang(zhangshangtong.cpp@gmail.com) # # 2016 Kenta Shimada(hyperkentakun@gmail.com) # # 2017 Aja Rangaswamy (aja004@gmail.com) # # 2019 Baochen Su (subaochen@126.com) # # Permission given to modify the code as long as you keep this # # declaration at the top # ####################################################################### import matplotlib import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D import numpy as np import seaborn as sns from scipy.stats import poisson matplotlib.use('Agg') # maximum # of cars in each location MAX_CARS = 20 # maximum # of cars to move during night MAX_MOVE_OF_CARS = 5 # expectation for rental requests in first location REQUEST_FIRST_LOC = 3 # expectation for rental requests in second location REQUEST_SECOND_LOC = 4 # expectation for # of cars returned in first location RETURNS_FIRST_LOC = 3 # expectation for # of cars returned in second location RETURNS_SECOND_LOC = 2 DISCOUNT = 0.9 # credit earned by a car RENTAL_CREDIT = 10 # cost of moving a car MOVE_CAR_COST = 2 # all possible actions actions = np.arange(-MAX_MOVE_OF_CARS, MAX_MOVE_OF_CARS + 1) # An up bound for poisson distribution # If n is greater than this value, then the probability of getting n is truncated to 0 # POISSON_UPPER_BOUND = 11 # Probability for poisson distribution # @lam: lambda should be less than 10 for this function poisson_cache = dict() def poisson_probability(n, lam): global poisson_cache key = n * 10 + lam if key not in poisson_cache: poisson_cache[key] = poisson.pmf(n, lam) return poisson_cache[key] def expected_return(state, action, state_value, constant_returned_cars, POISSON_UPPER_BOUND): """ @state: [# of cars in first location, # of cars in second location] @action: positive if moving cars from first location to second location, negative if moving cars from second location to first location @stateValue: state value matrix @constant_returned_cars: if set True, model is simplified such that the # of cars returned in daytime becomes constant rather than a random value from poisson distribution, which will reduce calculation time and leave the optimal policy/value state matrix almost the same """ # initialize total return returns = 0.0 # cost for moving cars returns -= MOVE_CAR_COST * abs(action) # moving cars init_num_of_cars_first_location = min(state[0] - action, MAX_CARS) init_num_of_cars_second_location = min(state[1] + action, MAX_CARS) # go through all possible rental requests for request_first_loc in range(POISSON_UPPER_BOUND): for request_second_loc in range(POISSON_UPPER_BOUND): # probability for current combination of rental requests prob = poisson_probability(request_first_loc, REQUEST_FIRST_LOC) * \ poisson_probability(request_second_loc, REQUEST_SECOND_LOC) num_of_cars_first_loc = init_num_of_cars_first_location num_of_cars_second_loc = init_num_of_cars_second_location # valid rental requests should be less than actual # of cars valid_rental_first_loc = min(num_of_cars_first_loc, request_first_loc) valid_rental_second_loc = min(num_of_cars_second_loc, request_second_loc) # get credits for renting reward = (valid_rental_first_loc + valid_rental_second_loc) * RENTAL_CREDIT num_of_cars_first_loc -= valid_rental_first_loc num_of_cars_second_loc -= valid_rental_second_loc if constant_returned_cars: # get returned cars, those cars can be used for renting tomorrow returned_cars_first_loc = RETURNS_FIRST_LOC returned_cars_second_loc = RETURNS_SECOND_LOC num_of_cars_first_loc = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS) num_of_cars_second_loc = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS) returns += prob * (reward + DISCOUNT * state_value[num_of_cars_first_loc, num_of_cars_second_loc]) else: for returned_cars_first_loc in range(POISSON_UPPER_BOUND): for returned_cars_second_loc in range(POISSON_UPPER_BOUND): prob_return = poisson_probability(returned_cars_first_loc, RETURNS_FIRST_LOC) *\ poisson_probability(returned_cars_second_loc, RETURNS_SECOND_LOC) num_of_cars_first_loc_ = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS) num_of_cars_second_loc_ = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS) # prob是request(出租)某个数量的概率,prob_return是return(归还)某个数量的概率 # 两者的乘积既考虑了出租的影响,也考虑了归还的影响(对num_of_cars_of_first_loc的影响) # 从数学的观点来看,这是一个条件概率问题(如何更清晰的数学表达?) prob_ = prob_return * prob returns += prob_ * (reward + DISCOUNT * state_value[num_of_cars_first_loc_, num_of_cars_second_loc_]) return returns def figure_4_2(POISSON_UPPER_BOUND, constant_returned_cars=True): value = np.zeros((MAX_CARS + 1, MAX_CARS + 1)) policy = np.zeros(value.shape, dtype=np.int) iterations = 0 supfig, axes = plt.subplots(2, 3, figsize=(40, 20)) supfig.suptitle(f'policy iteration[POISSON_UPPER_BOUND={POISSON_UPPER_BOUND}]', fontsize=30) plt.subplots_adjust(wspace=0.1, hspace=0.2) axes = axes.flatten() while True: fig = sns.heatmap(np.flipud(policy), cmap="YlGnBu", ax=axes[iterations]) fig.set_ylabel('# cars at first location', fontsize=30) fig.set_yticks(list(reversed(range(MAX_CARS + 1)))) fig.set_xlabel('# cars at second location', fontsize=30) fig.set_title('$\pi_{}$'.format(iterations), fontsize=30) # policy evaluation (in-place) while True: old_value = value.copy() for i in range(MAX_CARS + 1): for j in range(MAX_CARS + 1): new_state_value = expected_return([i, j], policy[i, j], value, constant_returned_cars, POISSON_UPPER_BOUND) value[i, j] = new_state_value max_value_change = abs(old_value - value).max() print('max value change {}'.format(max_value_change)) if max_value_change < 1e-4: break # policy improvement print(f'policy improvement on policy:{policy}') policy_stable = True for i in range(MAX_CARS + 1): for j in range(MAX_CARS + 1): old_action = policy[i, j] action_returns = [] for action in actions: if (0 <= action <= i) or (-j <= action <= 0): action_returns.append(expected_return([i, j], action, value, constant_returned_cars, POISSON_UPPER_BOUND)) else: action_returns.append(-np.inf) # 这里是关键所在:寻找使得value function最大化的action new_action = actions[np.argmax(action_returns)] policy[i, j] = new_action if policy_stable and old_action != new_action: policy_stable = False print('policy stable {}'.format(policy_stable)) if policy_stable: fig = sns.heatmap(np.flipud(value), cmap="YlGnBu", ax=axes[-1]) fig.set_ylabel('# cars at first location', fontsize=30) fig.set_yticks(list(reversed(range(MAX_CARS + 1)))) fig.set_xlabel('# cars at second location', fontsize=30) fig.set_title('optimal value', fontsize=30) plt.savefig(f'../images/rl/dp/figure_4_2_{POISSON_UPPER_BOUND}.png') # 也画出三维曲面图 plot_3d_value(value) break iterations += 1 plt.close() def plot_3d_value(value): fig_3d = plt.figure() # 定义新的三维坐标轴 ax3 = Axes3D(fig_3d) # 定义三维数据 xx = np.arange(0, MAX_CARS, 1) yy = np.arange(MAX_CARS, 0, -1) X, Y = np.meshgrid(xx, yy) # python在这里表现的很智能! Z = value[X, Y] surf = ax3.plot_surface(X, Y, Z, cmap='rainbow', linewidth=0, antialiased=False) fig_3d.colorbar(surf, shrink=0.5, aspect=5) ax3.set_title('optimal value', fontsize=20) ax3.set_xlabel('# cars at second location', fontsize=10) ax3.set_ylabel('# cars at first location', fontsize=10) plt.savefig(f'../images/rl/dp/figure_4_2_3d_value_{POISSON_UPPER_BOUND}.png') if __name__ == '__main__': for POISSON_UPPER_BOUND in np.arange(7, 20, 1): figure_4_2(POISSON_UPPER_BOUND)