import numpy as np
import time

## Network architecture
NUM_INPUT = 784    # Number of input neurons
NUM_OUTPUT = 10    # Number of output neurons
NUM_CHECK = 5      # Number of examples on which to check the gradient

## Hyperparameters
NUM_HIDDEN = 100
LEARNING_RATE = 0.2
BATCH_SIZE = 64
NUM_EPOCH = 1000
# DECAY_RATE = 0.6
# DECAY_STEPS = 1000
WHETHER_TEST = False  # If True, print tensor shapes for debugging

print("NUM_HIDDEN: ", NUM_HIDDEN)
print("LEARNING_RATE: ", LEARNING_RATE)
print("BATCH_SIZE: ", BATCH_SIZE)
print("NUM_EPOCH: ", NUM_EPOCH)


# Given a vector w containing all the weights and bias vectors, extract
# and return the individual weights and biases W1, b1, W2, b2.
def unpack(w):
    W1 = np.reshape(w[:NUM_INPUT * NUM_HIDDEN], (NUM_INPUT, NUM_HIDDEN))
    w = w[NUM_INPUT * NUM_HIDDEN:]
    b1 = np.reshape(w[:NUM_HIDDEN], NUM_HIDDEN)
    w = w[NUM_HIDDEN:]
    W2 = np.reshape(w[:NUM_HIDDEN * NUM_OUTPUT], (NUM_HIDDEN, NUM_OUTPUT))
    w = w[NUM_HIDDEN * NUM_OUTPUT:]
    b2 = np.reshape(w, NUM_OUTPUT)
    return W1, b1, W2, b2


# Given individual weights and biases W1, b1, W2, b2, concatenate them and
# return a vector w containing all of them.
def pack(W1, b1, W2, b2):
    W1_ = np.reshape(W1, NUM_INPUT * NUM_HIDDEN)
    W2_ = np.reshape(W2, NUM_HIDDEN * NUM_OUTPUT)
    w = np.concatenate((W1_, b1, W2_, b2))
    return w


# Load the images and labels from a specified dataset (train or test).
def loadData(which):
    images = np.load("./data/mnist_{}_images.npy".format(which))
    labels = np.load("./data/mnist_{}_labels.npy".format(which))
    return images, labels


## 1. Forward Propagation

def softmax(x):
    # Shift by the max before exponentiating so np.exp cannot overflow;
    # this leaves the result mathematically unchanged.
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x)


def cross_entropy(predicted, ground_truth):
    # CE loss for one example: -sum_k y_k * log(yhat_k).
    log_predicted = np.log(predicted)
    loss_matrix = ground_truth * log_predicted
    loss = -np.sum(loss_matrix)
    return loss


def sgn(x):
    # Derivative of ReLU: 1 where x > 0, and 0 elsewhere.
    result = np.sign(x)
    result = np.maximum(result, 0)
    return result


# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the cross-entropy (CE) loss.
def fCE(X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    loss = 0.0
    # Cache the per-example activations in globals so that gradCE and test
    # can reuse them without redoing the forward pass.
    global h1_array, z1_array, z2_array, y_predict_array
    h1_array = []
    z1_array = []
    z2_array = []
    y_predict_array = []
    for i in range(len(X)):
        z1 = W1.T.dot(X[i]) + b1     # hidden pre-activations
        h1 = np.maximum(z1, 0)       # ReLU
        z2 = W2.T.dot(h1) + b2       # output logits
        y_predict = softmax(z2)
        loss = loss + cross_entropy(y_predict, Y[i])
        h1_array.append(h1)
        z1_array.append(z1)
        z2_array.append(z2)
        y_predict_array.append(y_predict)
        if i == 0 and WHETHER_TEST:
            print("X:", X.shape)
            print("Y:", Y.shape)
            print("W1:", W1.shape)
            print("W2:", W2.shape)
            print("b1:", b1.shape)
            print("b2:", b2.shape)
            print("h1:", h1.shape)
            print("z1:", z1.shape)
            print("z2:", z2.shape)
            print("y_predict:", y_predict.shape)
    loss = loss / len(X)
    return loss
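
# For comparison, here is a minimal batched sketch of the same forward pass in
# matrix form (fCE_vectorized is a hypothetical helper, not called anywhere in
# this script; it assumes X has shape (n, NUM_INPUT) and Y has shape
# (n, NUM_OUTPUT)). It computes the identical mean cross-entropy without the
# per-example loop:
def fCE_vectorized(X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    Z1 = X.dot(W1) + b1                          # (n, NUM_HIDDEN) pre-activations
    H1 = np.maximum(Z1, 0)                       # ReLU
    Z2 = H1.dot(W2) + b2                         # (n, NUM_OUTPUT) logits
    Z2 = Z2 - Z2.max(axis=1, keepdims=True)      # shift each row for stability
    P = np.exp(Z2)
    P = P / P.sum(axis=1, keepdims=True)         # row-wise softmax
    return -np.sum(Y * np.log(P)) / len(X)       # mean cross-entropy
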
## 2. Backward Propagation

# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the gradient of fCE. Assumes fCE has
# just been called on the same batch, so the cached activations are current.
def gradCE(X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    delta_W_1_average = None
    delta_W_2_average = None
    delta_b_1_average = None
    delta_b_2_average = None
    for i in range(len(X)):
        # Output layer: for softmax + cross-entropy, dL/dz2 = yhat - y.
        delta_b_2 = y_predict_array[i] - Y[i]
        y_difference = delta_b_2.reshape(NUM_OUTPUT, 1)
        delta_W_2 = y_difference.dot(h1_array[i].reshape(1, NUM_HIDDEN)).T
        # Hidden layer: backpropagate through W2, then through the ReLU.
        a = W2.dot(delta_b_2)
        b = np.array(a) * np.array(sgn(z1_array[i]))
        delta_W_1 = b.reshape(NUM_HIDDEN, 1).dot(X[i].reshape(1, NUM_INPUT))
        delta_b_1 = b
        if i == 0:
            if WHETHER_TEST:
                print("delta_W_1:", delta_W_1.shape)
                print("delta_b_1:", delta_b_1.shape)
                print("delta_W_2:", delta_W_2.shape)
                print("delta_b_2:", delta_b_2.shape)
            delta_W_1_average = delta_W_1
            delta_W_2_average = delta_W_2
            delta_b_1_average = delta_b_1
            delta_b_2_average = delta_b_2
        else:
            delta_W_1_average = delta_W_1_average + delta_W_1
            delta_W_2_average = delta_W_2_average + delta_W_2
            delta_b_1_average = delta_b_1_average + delta_b_1
            delta_b_2_average = delta_b_2_average + delta_b_2
    # Average over the batch; the transpose brings delta_W_1 from
    # (NUM_HIDDEN, NUM_INPUT) back to W1's shape (NUM_INPUT, NUM_HIDDEN).
    delta_W_1 = delta_W_1_average.T / len(X)
    delta_W_2 = delta_W_2_average / len(X)
    delta_b_1 = delta_b_1_average / len(X)
    delta_b_2 = delta_b_2_average / len(X)
    if WHETHER_TEST:
        print("delta_W_1:", delta_W_1.shape)
        print("delta_b_1:", delta_b_1.shape)
        print("delta_W_2:", delta_W_2.shape)
        print("delta_b_2:", delta_b_2.shape)
    delta = pack(delta_W_1, delta_b_1, delta_W_2, delta_b_2)
    return delta


# Run the forward pass on a dataset and return the classification accuracy (%).
def test(testX, testY, w):
    fCE(testX, testY, w)  # populates y_predict_array
    total_sum = 0
    for i in range(len(testX)):
        predict_label = np.argmax(y_predict_array[i])
        ground_truth_label = np.argmax(testY[i])
        if predict_label == ground_truth_label:
            total_sum = total_sum + 1
    success_rate = total_sum / len(testX) * 100
    return success_rate


## 3. Parameter Update

# Given training and testing datasets and an initial set of weights/biases,
# train the NN with minibatch SGD.
def train(trainX, trainY, testX, testY, w):
    print("initial test: ", test(testX, testY, w), "%")
    for i in range(NUM_EPOCH):
        # Update the learning rate (decay disabled by default):
        # real_learning_rate = LEARNING_RATE * (DECAY_RATE ** (i / DECAY_STEPS))
        # Sample a random minibatch of BATCH_SIZE examples.
        train_batch_ID = np.arange(len(trainX))
        np.random.shuffle(train_batch_ID)
        train_batch_ID = train_batch_ID[0:BATCH_SIZE]
        train_batch_X = trainX[train_batch_ID]
        train_batch_Y = trainY[train_batch_ID]
        loss = fCE(train_batch_X, train_batch_Y, w)
        print("current epoch:", i)
        print("current loss:", loss)
        # print("current learning rate:", real_learning_rate)
        delta = gradCE(train_batch_X, train_batch_Y, w)
        w = w - LEARNING_RATE * delta
        if (i + 1) % 100 == 0:
            # Accuracy on the training set (final training accuracy: 95.79%).
            print("current train accuracy: ", test(trainX, trainY, w), "%")
    # Final test accuracy: 92.84%.
    print("final test: ", test(testX, testY, w), "%")


if __name__ == "__main__":
    # Load data
    start_time = time.time()
    trainX, trainY = loadData("train")
    testX, testY = loadData("test")
    print("len(trainX): ", len(trainX))
    print("len(testX): ", len(testX))

    # Initialize weights uniformly in [-1/sqrt(fan_in), 1/sqrt(fan_in)],
    # with small positive biases.
    W1 = 2 * (np.random.random(size=(NUM_INPUT, NUM_HIDDEN)) / NUM_INPUT ** 0.5) - 1. / NUM_INPUT ** 0.5
    b1 = 0.01 * np.ones(NUM_HIDDEN)
    W2 = 2 * (np.random.random(size=(NUM_HIDDEN, NUM_OUTPUT)) / NUM_HIDDEN ** 0.5) - 1. / NUM_HIDDEN ** 0.5
    b2 = 0.01 * np.ones(NUM_OUTPUT)
    w = pack(W1, b1, W2, b2)
    print("Shape of w:", w.shape)

    # Train the network and report the accuracy on the training and test set.
    train(trainX, trainY, testX, testY, w)
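

# NUM_CHECK is declared at the top of the script but never used. A minimal
# sketch of the gradient check it presumably refers to is given below: it
# compares gradCE against a central finite difference of fCE on NUM_CHECK
# examples, at a few randomly chosen coordinates of w. (checkGrad is a
# hypothetical helper; to call it from the __main__ block, e.g. as
# checkGrad(trainX[:NUM_CHECK], trainY[:NUM_CHECK], w) before train(), its
# definition would need to be moved above that block.)
def checkGrad(X, Y, w, num_coords=10, eps=1e-6):
    # gradCE reads the activations cached by the most recent fCE call, so
    # run the forward pass on this batch first.
    fCE(X, Y, w)
    analytic = gradCE(X, Y, w)
    for j in np.random.choice(len(w), num_coords, replace=False):
        w_plus = w.copy()
        w_plus[j] += eps
        w_minus = w.copy()
        w_minus[j] -= eps
        # Central difference approximation of dfCE/dw_j.
        numeric = (fCE(X, Y, w_plus) - fCE(X, Y, w_minus)) / (2 * eps)
        print("coord {}: analytic {:.6g}, numeric {:.6g}".format(j, analytic[j], numeric))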