import argparse, os, math, time, sys

import mxnet as mx
import logging
from mxnet import gluon, nd, image
from mxnet.gluon.nn import Block, HybridBlock
from mxnet.gluon.data.vision import transforms
from mxnet.contrib.quantization import *

import gluoncv as gcv
gcv.utils.check_version('0.6.0')
from gluoncv.data import imagenet
from gluoncv.model_zoo import get_model

# CLI
def parse_args():
    """Build the command-line parser for this script and parse ``sys.argv``.

    The options cover the three ways the script is used: accuracy validation
    on ImageNet (raw images via ``--data-dir`` or RecordIO files via
    ``--rec-dir``), throughput measurement on synthetic data
    (``--benchmark``) and quantization with calibration (``--calibration``).

    Returns
    -------
    argparse.Namespace
        The parsed options. Dashes in option names become underscores
        (``--batch-size`` -> ``batch_size``); ``-j/--num-data-workers`` is
        stored as ``num_workers``.
    """
    parser = argparse.ArgumentParser(
        description='Validate, benchmark or quantize a pre-trained model for image classification.')

    # Data source and loading.
    parser.add_argument('--data-dir', type=str, default='~/.mxnet/datasets/imagenet',
                        help='Imagenet directory for validation.')
    parser.add_argument('--rec-dir', type=str, default='',
                        help='recio directory for validation.')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='batch size per device (CPU/GPU).')
    parser.add_argument('--num-gpus', type=int, default=0,
                        help='number of gpus to use.')
    parser.add_argument('-j', '--num-data-workers', dest='num_workers', default=4, type=int,
                        help='number of preprocessing workers')

    # Model selection.
    parser.add_argument('--model', type=str, default='model',
                        help='type of model to use. see vision_model for options.')
    parser.add_argument('--deploy', action='store_true',
                        help='whether load static model for deployment')
    parser.add_argument('--model-prefix', type=str,
                        help='load static model as hybridblock.')
    parser.add_argument('--quantized', action='store_true',
                        help='use int8 pretrained model')
    parser.add_argument('--params-file', type=str,
                        help='local parameter file to load, instead of pre-trained weight.')
    # An explicit default keeps ``None`` from being handed to the dtype casts
    # performed elsewhere in this script when the flag is omitted.
    parser.add_argument('--dtype', type=str, default='float32',
                        help='data type the network and its input are cast to, default is float32.')
    parser.add_argument('--use_se', action='store_true',
                        help='use SE layers or not in resnext. default is false.')

    # Input geometry and run length.
    parser.add_argument('--input-size', type=int, default=224,
                        help='input shape of the image, default is 224.')
    parser.add_argument('--crop-ratio', type=float, default=0.875,
                        help='The ratio for crop and input size, for validation dataset only')
    parser.add_argument('--num-batches', type=int, default=100,
                        help='run specified number of batches for inference')
    parser.add_argument('--benchmark', action='store_true',
                        help='use synthetic data to evaluate benchmark')

    # Quantization / calibration.
    parser.add_argument('--calibration', action='store_true',
                        help='quantize model')
    parser.add_argument('--num-calib-batches', type=int, default=5,
                        help='number of batches for calibration')
    parser.add_argument('--quantized-dtype', type=str, default='auto',
                        choices=['auto', 'int8', 'uint8'],
                        help='quantization destination data type for input data')
    parser.add_argument('--calib-mode', type=str, default='naive',
                        help='calibration mode used for generating calibration table for the quantized symbol; supports'
                             ' 1. none: no calibration will be used. The thresholds for quantization will be calculated'
                             ' on the fly. This will result in inference speed slowdown and loss of accuracy'
                             ' in general.'
                             ' 2. naive: simply take min and max values of layer outputs as thresholds for'
                             ' quantization. In general, the inference accuracy worsens with more examples used in'
                             ' calibration. It is recommended to use `entropy` mode as it produces more accurate'
                             ' inference results.'
                             ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal'
                             ' thresholds. This mode is expected to produce the best inference accuracy of all three'
                             ' kinds of quantized models if the calibration dataset is representative enough of the'
                             ' inference dataset.')
    return parser.parse_args()

def benchmark(network, ctx, batch_size=64, image_size=224, num_iter=100, datatype='float32',
              dryrun=5):
    """Time forward passes of ``network`` on one batch of synthetic data.

    A single random batch of shape ``(batch_size, 3, image_size, image_size)``
    is created on ``ctx`` and fed to the network repeatedly. The first
    ``dryrun`` passes are warm-up and are excluded from the measurement.

    Parameters
    ----------
    network : callable
        Called as ``network(data)``; the result must provide ``asnumpy()``.
    ctx : mxnet Context
        Device on which the synthetic batch is created.
    batch_size : int
        Number of images in the synthetic batch.
    image_size : int
        Height and width of each synthetic image.
    num_iter : int
        Number of timed forward passes; must be positive.
    datatype : str
        Data type of the synthetic batch.
    dryrun : int
        Number of untimed warm-up passes run before the clock starts.

    Returns
    -------
    float
        Wall-clock seconds spent on the ``num_iter`` timed passes.

    Raises
    ------
    ValueError
        If ``num_iter`` is not positive (there would be nothing to time).
    """
    if num_iter <= 0:
        raise ValueError('num_iter must be a positive integer, got %r' % (num_iter,))

    input_shape = (batch_size, 3, image_size, image_size)
    data = mx.random.uniform(-1.0, 1.0, shape=input_shape, ctx=ctx, dtype=datatype)

    def forward_once():
        # asnumpy() copies the output into a NumPy array, so the forward pass
        # has to be finished before the next iteration (or the clock) runs.
        network(data).asnumpy()

    for _ in range(dryrun):
        forward_once()

    tic = time.time()
    for _ in range(num_iter):
        forward_once()
    return time.time() - tic

def test(network, ctx, val_data, mode='image'):
    """Run ``network`` over ``val_data`` and report top-1 / top-5 error.

    Running error rates are printed after every batch and the overall
    throughput is printed at the end. The function reads the module-level
    ``opt`` (parsed command-line options) for ``rec_dir`` and ``dtype``.

    Parameters
    ----------
    network : callable
        Model applied to each per-device slice of a batch.
    ctx : list of mxnet Context
        Devices the batch is split across.
    val_data : iterable
        Yields batches. With ``mode='image'`` each batch is indexed as
        ``batch[0]`` (data) and ``batch[1]`` (labels); otherwise the batch
        is read through ``batch.data[0]`` and ``batch.label[0]``.
    mode : str
        ``'image'`` for indexable batches, anything else for batches with
        ``data`` / ``label`` attributes.

    Returns
    -------
    tuple of float
        ``(top1_error, top5_error)``, each computed as ``1 - accuracy``.
    """
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)
    acc_top1.reset()
    acc_top5.reset()
    if not opt.rec_dir:
        # Total batch count is only used for the progress line below.
        num_batch = len(val_data)
    num = 0
    start = time.time()
    for i, batch in enumerate(val_data):
        if mode == 'image':
            batch_data, batch_label = batch[0], batch[1]
        else:
            batch_data, batch_label = batch.data[0], batch.label[0]
        data = gluon.utils.split_and_load(batch_data, ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch_label, ctx_list=ctx, batch_axis=0)
        outputs = [network(X.astype(opt.dtype, copy=False)) for X in data]
        acc_top1.update(label, outputs)
        acc_top5.update(label, outputs)

        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        if not opt.rec_dir:
            print('%d / %d : %.8f, %.8f'%(i, num_batch, 1-top1, 1-top5))
        else:
            print('%d : %.8f, %.8f'%(i, 1-top1, 1-top5))
        # Count the samples actually present in this batch; a final batch that
        # is shorter than the nominal batch size would otherwise inflate the
        # reported throughput.
        num += batch_data.shape[0]
    end = time.time()
    speed = num / (end - start)
    print('Throughput is %f img/sec.'% speed)

    _, top1 = acc_top1.get()
    _, top5 = acc_top5.get()
    return (1-top1, 1-top5)

if __name__ == '__main__':
    # Script entry point. Depending on the flags it either
    #   * calibrates and exports a quantized model (--calibration),
    #   * measures throughput on synthetic data (--benchmark), or
    #   * validates the model on ImageNet and prints its parameter count.
    #
    # `opt` has to stay a module-level name: test() reads it as a global.
    opt = parse_args()
    logging.basicConfig()
    logger = logging.getLogger('logger')
    logger.setLevel(logging.INFO)
    # Only `logger` is raised to INFO; the root logger keeps its default
    # level, so the options have to be logged through `logger` to be shown.
    logger.info(opt)

    batch_size = opt.batch_size
    classes = 1000

    # --batch-size is per device, so the loader batch grows with the GPU count.
    num_gpus = opt.num_gpus
    if num_gpus > 0:
        batch_size *= num_gpus
        ctx = [mx.gpu(i) for i in range(num_gpus)]
    else:
        ctx = [mx.cpu()]
    num_workers = opt.num_workers

    input_size = opt.input_size
    model_name = opt.model
    if opt.quantized:
        model_name = '_'.join((model_name, 'int8'))
    # Pre-trained weights are only requested when no local file is supplied.
    pretrained = not opt.params_file

    kwargs = {'ctx': ctx, 'pretrained': pretrained, 'classes': classes}
    if model_name.startswith('resnext'):
        kwargs['use_se'] = opt.use_se

    if opt.deploy:
        if not opt.model_prefix:
            raise ValueError('--deploy requires --model-prefix')
        model_name = 'deploy'
        # Load the exported symbol/params pair onto the devices that will
        # receive the data.
        net = mx.gluon.SymbolBlock.imports(
            '{}-symbol.json'.format(opt.model_prefix),
            ['data'],
            '{}-0000.params'.format(opt.model_prefix),
            ctx=ctx)
        net.hybridize(static_alloc=True, static_shape=True)
    else:
        net = get_model(model_name, **kwargs)
        net.cast(opt.dtype)
        if opt.params_file:
            net.load_parameters(opt.params_file, ctx=ctx)
        if opt.quantized:
            net.hybridize(static_alloc=True, static_shape=True)
        else:
            net.hybridize()

    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # Aligning with the TF implementation, the default crop-to-input ratio is
    # 0.875: the image is resized to ceil(input_size / ratio) and then
    # center-cropped to input_size.
    crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
    resize = int(math.ceil(input_size/crop_ratio))

    transform_test = transforms.Compose([
        transforms.Resize(resize, keep_ratio=True),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        normalize
    ])

    # Synthetic benchmarking needs no dataset, so the loader is only built
    # for validation / calibration runs.
    if not opt.benchmark:
        if not opt.rec_dir:
            val_data = gluon.data.DataLoader(
                imagenet.classification.ImageNet(opt.data_dir, train=False).transform_first(transform_test),
                batch_size=batch_size, shuffle=False, num_workers=num_workers)
        else:
            imgrec = os.path.join(opt.rec_dir, 'val.rec')
            imgidx = os.path.join(opt.rec_dir, 'val.idx')
            # The mean/std values below are the Normalize() constants above
            # scaled by 255.
            val_data = mx.io.ImageRecordIter(
                path_imgrec         = imgrec,
                path_imgidx         = imgidx,
                preprocess_threads  = num_workers,
                batch_size          = batch_size,

                resize              = resize,
                data_shape          = (3, input_size, input_size),
                mean_r              = 123.68,
                mean_g              = 116.779,
                mean_b              = 103.939,
                std_r               = 58.393,
                std_g               = 57.12,
                std_b               = 57.375
            )

    if opt.calibration and not opt.quantized:
        if opt.benchmark:
            # No data loader exists in benchmark mode, so there is nothing to
            # calibrate against.
            raise ValueError('--calibration needs validation data and cannot be combined with --benchmark')
        if num_gpus > 0:
            raise ValueError('currently only supports CPU with MKL-DNN backend')
        exclude_layers = []
        exclude_layers_match = ['flatten']
        logger.info('quantize net with batch size = %d', batch_size)
        net = quantize_net(
            net, quantized_dtype=opt.quantized_dtype, exclude_layers=exclude_layers,
            exclude_layers_match=exclude_layers_match, calib_data=val_data,
            calib_mode=opt.calib_mode, num_calib_examples=batch_size * opt.num_calib_batches, ctx=ctx[0],
            logger=logger)
        # The quantized model is exported to a `model` directory next to this script.
        dir_path = os.path.dirname(os.path.realpath(__file__))
        dst_dir = os.path.join(dir_path, 'model')
        if not os.path.isdir(dst_dir):
            os.mkdir(dst_dir)
        prefix = os.path.join(dst_dir, model_name +
                              '-quantized-' + opt.calib_mode)
        logger.info('Saving quantized model at %s', dst_dir)
        net.export(prefix, epoch=0)
        sys.exit()

    if opt.benchmark:
        print('-----benchmark mode for model %s-----'%model_name)
        time_cost = benchmark(network=net, ctx=ctx[0], image_size=opt.input_size, batch_size=opt.batch_size,
            num_iter=opt.num_batches, datatype='float32')
        fps = (opt.batch_size*opt.num_batches)/time_cost
        print('With batch size %s, %s batches, inference performance is %.2f img/sec' % (opt.batch_size, opt.num_batches, fps))
        sys.exit()

    eval_mode = 'rec' if opt.rec_dir else 'image'
    err_top1_val, err_top5_val = test(net, ctx, val_data, eval_mode)
    print(err_top1_val, err_top5_val)

    # Report the total number of parameter elements.
    if opt.deploy:
        # A deployed symbol file has no model-zoo name to rebuild it from
        # ('deploy' is only a label), so count the loaded network's own
        # parameters on the first device.
        count_net = net
        count_ctx = ctx[0]
    else:
        # Rebuild the same architecture (including use_se for resnext) on the
        # CPU with fresh weights; one dummy forward pass runs it once so every
        # parameter holds data before it is counted.
        count_ctx = mx.cpu()
        kwargs2 = dict(kwargs, ctx=count_ctx, pretrained=False)
        count_net = get_model(model_name, **kwargs2)
        count_net.initialize()
        count_net(mx.nd.zeros((1, 3, input_size, input_size)))
    params_count = sum(v.data(count_ctx).size for v in count_net.collect_params().values())

    print(params_count)