import argparse, time, logging, os, math, sys

import numpy as np
import mxnet as mx
from mxnet import gluon, nd
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms
from mxnet.contrib.quantization import *

import gluoncv as gcv
gcv.utils.check_version('0.6.0')
from gluoncv.data import mscoco
from gluoncv.model_zoo import get_model
from gluoncv.utils import makedirs
from gluoncv.nn.block import DSNT
from gluoncv.data.transforms.pose import transform_preds, get_final_preds, flip_heatmap
from gluoncv.data.transforms.presets.simple_pose import SimplePoseDefaultTrainTransform, SimplePoseDefaultValTransform
from gluoncv.utils.metrics.coco_keypoints import COCOKeyPointsMetric

# Command-line interface.
# Option names, dests, types and defaults are unchanged; help texts are
# corrected for a validation script and --calib-mode is restricted to the
# three modes its help text documents.
parser = argparse.ArgumentParser(description='Validate a model for pose estimation.')
parser.add_argument('--data-dir', type=str, default='~/.mxnet/datasets/coco',
                    help='root directory of the COCO dataset used for validation and calibration.')
parser.add_argument('--num-joints', type=int, required=True,
                    help='Number of joints to detect')
parser.add_argument('--batch-size', type=int, default=32,
                    help='batch size per device (CPU/GPU).')
parser.add_argument('--model-prefix', type=str, required=False,
                    help='path prefix of an exported static model '
                         '(<prefix>-symbol.json and <prefix>-0000.params); used with --deploy.')
parser.add_argument('--deploy', action='store_true',
                    help='whether load static model for deployment')
parser.add_argument('--quantized', action='store_true',
                    help='whether to use int8 pretrained model')
parser.add_argument('--num-iterations', type=int, default=100,
                    help='number of benchmarking iterations.')
parser.add_argument('--num-gpus', type=int, default=0,
                    help='number of gpus to use.')
parser.add_argument('-j', '--num-data-workers', dest='num_workers', default=4, type=int,
                    help='number of preprocessing workers')
parser.add_argument('--model', type=str, required=True,
                    help='type of model to use. see vision_model for options.')
parser.add_argument('--input-size', type=str, default='256,192',
                    help='input image size as "height,width". default is 256,192')
parser.add_argument('--params-file', type=str,
                    help='local parameters to load.')
parser.add_argument('--flip-test', action='store_true',
                    help='Whether to flip test input to ensemble results.')
parser.add_argument('--dsnt', action='store_true',
                    help='Whether to use dsnt to approximate coordinates.')
parser.add_argument('--mean', type=str, default='0.485,0.456,0.406',
                    help='mean vector for normalization')
parser.add_argument('--std', type=str, default='0.229,0.224,0.225',
                    help='std vector for normalization')
parser.add_argument('--score-threshold', type=float, default=0.0,
                    help='threshold value for predicted score.')
# dummy benchmark
parser.add_argument('--benchmark', action='store_true',
                    help='whether to use dummy data for benchmarking performance.')
# calibration
parser.add_argument('--calibration', action='store_true',
                    help='quantize model')
parser.add_argument('--num-calib-batches', type=int, default=5,
                    help='number of batches for calibration')
parser.add_argument('--quantized-dtype', type=str, default='auto',
                    choices=['auto', 'int8', 'uint8'],
                    help='quantization destination data type for input data')
# Reject unknown modes at parse time instead of failing later inside the
# quantization call.
parser.add_argument('--calib-mode', type=str, default='naive',
                    choices=['none', 'naive', 'entropy'],
                    help='calibration mode used for generating calibration table for the quantized symbol; supports'
                         ' 1. none: no calibration will be used. The thresholds for quantization will be calculated'
                         ' on the fly. This will result in inference speed slowdown and loss of accuracy'
                         ' in general.'
                         ' 2. naive: simply take min and max values of layer outputs as thresholds for'
                         ' quantization. In general, the inference accuracy worsens with more examples used in'
                         ' calibration. It is recommended to use `entropy` mode as it produces more accurate'
                         ' inference results.'
                         ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal'
                         ' thresholds. This mode is expected to produce the best inference accuracy of all three'
                         ' kinds of quantized models if the calibration dataset is representative enough of the'
                         ' inference dataset.')
opt = parser.parse_args()

logging.basicConfig()
logger = logging.getLogger('logger')
logger.setLevel(logging.INFO)
# Log through the INFO-level `logger`: a bare basicConfig() leaves the root
# logger at WARNING, so logging.info() would silently drop the options dump.
logger.info(opt)

batch_size = opt.batch_size
# NOTE(review): --num-joints is a required option but its value is not used;
# the joint count is fixed at 17 here. Confirm whether opt.num_joints should
# be honoured before changing this.
num_joints = 17

num_gpus = opt.num_gpus
# Default to CPU; with GPUs the per-device batch size is scaled to a global
# batch size and one context is created per GPU.
context = [mx.cpu()]
if num_gpus > 0:
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i) for i in range(num_gpus)]

num_workers = opt.num_workers

def get_data_loader(data_dir, batch_size, num_workers, input_size):
    """Build the COCO keypoints validation dataset, its loader and a batch splitter.

    Args:
        data_dir: dataset root passed to ``COCOKeyPoints``.
        batch_size: batch size given to the ``DataLoader``.
        num_workers: number of ``DataLoader`` worker processes.
        input_size: image size handed to the validation transform.

    Returns:
        A ``(val_dataset, val_data, val_batch_fn)`` tuple, where
        ``val_batch_fn(batch, ctx)`` splits the image tensor across ``ctx``
        and passes the remaining four batch fields through unchanged.

    The normalization mean/std are read from the module-level ``opt``.
    """
    dataset = mscoco.keypoints.COCOKeyPoints(data_dir, splits='person_keypoints_val2017')

    transform = SimplePoseDefaultValTransform(
        num_joints=dataset.num_joints,
        joint_pairs=dataset.joint_pairs,
        image_size=input_size,
        mean=list(map(float, opt.mean.split(','))),
        std=list(map(float, opt.std.split(','))))

    # Keep the dataset order and the final partial batch so every sample is
    # evaluated.
    loader = gluon.data.DataLoader(dataset.transform(transform),
                                   batch_size=batch_size,
                                   shuffle=False,
                                   last_batch='keep',
                                   num_workers=num_workers)

    def val_batch_fn(batch, ctx):
        # Only the images (field 0) are distributed over the devices.
        images = gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                            batch_axis=0, even_split=False)
        return images, batch[1], batch[2], batch[3], batch[4]

    return dataset, loader, val_batch_fn

# Parse the comma-separated --input-size value into a list of ints.
input_size = list(map(int, opt.input_size.split(',')))

# Real data is needed for calibration and for accuracy validation; only a
# pure benchmark run skips building the dataset, loader and metric.
if opt.calibration or not opt.benchmark:
    val_dataset, val_data, val_batch_fn = get_data_loader(
        data_dir=opt.data_dir,
        batch_size=batch_size,
        num_workers=num_workers,
        input_size=input_size)
    val_metric = COCOKeyPointsMetric(
        val_dataset,
        'coco_keypoints',
        data_shape=tuple(input_size),
        in_vis_thresh=opt.score_threshold)

# Use the model zoo's pretrained weights unless a local parameter file is given.
use_pretrained = not opt.params_file
# Quantized zoo models are looked up under the '<model>_int8' name.
model_name = '_'.join([opt.model, 'int8']) if opt.quantized else opt.model

if not opt.deploy:
    net = get_model(model_name, ctx=context, num_joints=num_joints, pretrained=use_pretrained)
    if not use_pretrained:
        net.load_parameters(opt.params_file, ctx=context)
    if opt.quantized:
        net.hybridize(static_alloc=True, static_shape=True)
    else:
        net.hybridize()
else:
    # Without a prefix the paths below would silently become
    # 'None-symbol.json' / 'None-0000.params'; fail with a clear message.
    if not opt.model_prefix:
        raise ValueError('--model-prefix is required when --deploy is set')
    model_name = 'deploy'
    net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(opt.model_prefix),
              ['data'], '{}-0000.params'.format(opt.model_prefix))
    net.hybridize(static_alloc=True, static_shape=True)

print("Inference on model {} started!".format(model_name))

# calibration on FP32 model
def calibration(net, val_data, opt, ctx, logger):
    """Quantize ``net`` with ``val_data`` as calibration data and export the result.

    Args:
        net: network handed to ``quantize_net``.
        val_data: data loader used as calibration data.
        opt: parsed options; reads ``quantized_dtype``, ``calib_mode``,
            ``batch_size``, ``num_calib_batches`` and ``model``.
        ctx: context passed to ``quantize_net``.
        logger: logger passed to ``quantize_net`` and used for the status message.

    Raises:
        ValueError: if the module-level ``num_gpus`` is positive.

    The quantized network is exported into a ``model`` directory next to this
    script, under the prefix ``<model>-quantized-<calib_mode>``.
    """
    if num_gpus > 0:
        raise ValueError('currently only supports CPU with MKL-DNN backend')

    # No layers are excluded from quantization, by name or by pattern.
    quantized = quantize_net(
        net,
        calib_data=val_data,
        quantized_dtype=opt.quantized_dtype,
        calib_mode=opt.calib_mode,
        exclude_layers=[],
        num_calib_examples=opt.batch_size * opt.num_calib_batches,
        exclude_layers_match=[],
        ctx=ctx,
        logger=logger)

    script_dir = os.path.dirname(os.path.realpath(__file__))
    dst_dir = os.path.join(script_dir, 'model')
    if not os.path.isdir(dst_dir):
        os.mkdir(dst_dir)

    export_prefix = os.path.join(dst_dir, '-'.join([opt.model, 'quantized', opt.calib_mode]))
    logger.info('Saving quantized model at %s', dst_dir)
    quantized.export(export_prefix, epoch=0)


if opt.dsnt:
    # The DSNT head works on a map a quarter of the input size per dimension.
    heatmap_size = [int(side / 4) for side in input_size]
    # NOTE(review): the size is passed in reversed order relative to
    # input_size -- presumably (width, height); confirm against DSNT.
    net_dsnt = DSNT(size=list(reversed(heatmap_size)))
    net_dsnt.initialize(ctx=context)
    net_dsnt.hybridize()

def validate(val_data, val_dataset, net, ctx):
    """Run ``net`` over ``val_data`` and print the keypoint metric.

    Args:
        val_data: iterable of batches accepted by the module-level ``val_batch_fn``.
        val_dataset: dataset providing ``joint_pairs`` for flip testing.
        net: network to evaluate.
        ctx: a single ``mx.Context`` or a list of contexts.

    Relies on the module-level ``opt``, ``val_metric`` and ``val_batch_fn``,
    and on ``net_dsnt`` when ``--dsnt`` is set. Returns ``None``.
    """
    ctx = [ctx] if isinstance(ctx, mx.Context) else ctx

    val_metric.reset()

    from tqdm import tqdm
    cpu = mx.cpu()
    for batch in tqdm(val_data):
        data, scale, center, score, imgid = val_batch_fn(batch, ctx)

        heatmaps = [net(x) for x in data]
        if opt.flip_test:
            # Predict on mirrored inputs, flip the heatmaps back and average
            # them with the plain predictions.
            mirrored_inputs = [nd.flip(x, axis=3) for x in data]
            mirrored_outputs = [net(x) for x in mirrored_inputs]
            restored = [flip_heatmap(m, val_dataset.joint_pairs, shift=True)
                        for m in mirrored_outputs]
            heatmaps = [(plain + flipped) / 2
                        for plain, flipped in zip(heatmaps, restored)]

        if opt.dsnt:
            heatmaps = [net_dsnt(h)[0] for h in heatmaps]

        # Gather the per-device outputs on the CPU as one batch.
        on_cpu = [h.as_in_context(cpu) for h in heatmaps]
        stacked = nd.concat(*on_cpu, dim=0) if len(on_cpu) > 1 else on_cpu[0]

        if opt.dsnt:
            # DSNT outputs are shifted by 0.5, scaled and offset by the
            # per-sample center; every confidence is set to one.
            preds = (stacked - 0.5) * scale.expand_dims(axis=1) + center.expand_dims(axis=1)
            maxvals = nd.ones(preds.shape[0:2] + (1,))
        else:
            preds, maxvals = get_final_preds(stacked, center.asnumpy(), scale.asnumpy())
        val_metric.update(preds, maxvals, score, imgid)

    metric_name, metric_score = val_metric.get()
    print("Inference Completed! %s = %.4f" % (metric_name, metric_score))


def benchmarking(net, opt, ctx):
    """Measure forward-pass throughput of ``net`` on random input and print it.

    Args:
        net: network to benchmark; called with a single random batch.
        opt: parsed options; reads ``batch_size`` and ``num_iterations``.
        ctx: a single ``mx.Context`` or a list of contexts; only the first
            context is used for the input data.

    Raises:
        ValueError: if ``opt.num_iterations`` is not positive. Previously this
            surfaced as an UnboundLocalError after the warm-up runs.

    The input shape is ``(batch_size, 3) + input_size``, using the
    module-level ``input_size``.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    bs = opt.batch_size
    num_iterations = opt.num_iterations
    if num_iterations <= 0:
        raise ValueError('--num-iterations must be a positive integer, got %d' % num_iterations)

    input_shape = (bs, 3) + tuple(input_size)
    size = num_iterations * bs
    data = mx.random.uniform(-1.0, 1.0, shape=input_shape, ctx=ctx[0], dtype='float32')
    dry_run = 5

    from tqdm import tqdm
    with tqdm(total=size + dry_run * bs) as pbar:
        # Warm-up iterations are excluded from the timing.
        for _ in range(dry_run):
            net(data).wait_to_read()
            pbar.update(bs)

        tic = time.time()
        for _ in range(num_iterations):
            # wait_to_read() blocks until the result is computed, so the
            # timing covers the actual work and not just the enqueueing.
            net(data).wait_to_read()
            pbar.update(bs)
        # Stop the clock before the progress bar is torn down.
        elapsed = time.time() - tic

    speed = size / elapsed
    print('With batch size %d , %d batches, throughput is %f imgs/sec' % (bs, num_iterations, speed))


if __name__ == '__main__':
    # Exactly one mode runs: calibration takes precedence over benchmarking,
    # and validation is the default.
    if opt.calibration:
        # Calibration uses only the first context and then ends the process.
        calibration(net, val_data, opt, context[0], logger)
        sys.exit()
    elif opt.benchmark:
        print("---------- Benchmarking on %s model -------------" % model_name)
        benchmarking(net, opt, context)
    else:
        validate(val_data, val_dataset, net, context)