import torch
import torchvision
import random
import time
import argparse
import os
import sys
import math
import torch.nn as nn
import torch.multiprocessing as mp
from fp16util import network_to_half, get_param_copy
from shufflenet import shufflenet
from shufflenet_v2 import shufflenet as shufflenet_v2

def weight_init(m):
    # He (Kaiming) initialization for conv layers; BN layers get unit scale and zero bias.
    if isinstance(m, nn.Conv2d):
        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
        m.weight.data.normal_(0, math.sqrt(2. / n))
        if m.bias is not None:
            m.bias.data.zero_()
    elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1)
        m.bias.data.zero_()

#'shufflenet_v2_x0_5', 'shufflenet_v2_x1_0', 'shufflenet_v2_x1_5', 'shufflenet_v2_x2_0', 'shufflenetv2'
def get_network(net):
    if (net == "alexnet"):
        return torchvision.models.alexnet().to(device="cuda")
    elif (net == "densenet121"):
        return torchvision.models.densenet121().to(device="cuda")
    elif (net == "densenet161"):
        return torchvision.models.densenet161().to(device="cuda")
    elif (net == "densenet169"):
        return torchvision.models.densenet169().to(device="cuda")
    elif (net == "densenet201"):
        return torchvision.models.densenet201().to(device="cuda")
    elif (net == "googlenet"):
        return torchvision.models.googlenet().to(device="cuda")
    elif (net == "inception"):
        # NOTE: torchvision.models.inception is a submodule, not a constructor, so the original
        # call would fail; "inception" is treated here as an alias for inception_v3 (assumption).
        return torchvision.models.inception_v3(aux_logits=False).to(device="cuda")
    elif (net == "inception_v3"):
        return torchvision.models.inception_v3(aux_logits=False).to(device="cuda")
    elif (net == "mobilenet_v2"):
        return torchvision.models.mobilenet_v2().to(device="cuda")
    elif (net == "resnet18"):
        return torchvision.models.resnet18().to(device="cuda")
    elif (net == "resnet34"):
        return torchvision.models.resnet34().to(device="cuda")
    elif (net == "resnet50"):
        return torchvision.models.resnet50().to(device="cuda")
    elif (net == "resnet101"):
        return torchvision.models.resnet101().to(device="cuda")
    elif (net == "resnet152"):
        return torchvision.models.resnet152().to(device="cuda")
    elif (net == "resnext50"):
        return torchvision.models.resnext50_32x4d().to(device="cuda")
    elif (net == "resnext101"):
        return torchvision.models.resnext101_32x8d().to(device="cuda")
    elif (net == "shufflenet"):
        model = shufflenet().to(device="cuda")
        model.apply(weight_init)
        return model
    elif (net == "shufflenet_v2_x05"):
        return torchvision.models.shufflenet_v2_x0_5().to(device="cuda")
    elif (net == "shufflenet_v2_x10"):
        return torchvision.models.shufflenet_v2_x1_0().to(device="cuda")
    elif (net == "shufflenet_v2_x15"):
        return torchvision.models.shufflenet_v2_x1_5().to(device="cuda")
    elif (net == "SqueezeNet"):
        return torchvision.models.squeezenet1_0().to(device="cuda")
    elif (net == "SqueezeNet1.1"):
        return torchvision.models.squeezenet1_1().to(device="cuda")
    elif (net == "vgg11"):
        return torchvision.models.vgg11().to(device="cuda")
    elif (net == "vgg13"):
        return torchvision.models.vgg13().to(device="cuda")
    elif (net == "vgg16"):
        return torchvision.models.vgg16().to(device="cuda")
    elif (net == "vgg19"):
        return torchvision.models.vgg19().to(device="cuda")
    elif (net == "vgg11_bn"):
        return torchvision.models.vgg11_bn().to(device="cuda")
    elif (net == "vgg13_bn"):
        return torchvision.models.vgg13_bn().to(device="cuda")
    elif (net == "vgg16_bn"):
        return torchvision.models.vgg16_bn().to(device="cuda")
    elif (net == "vgg19_bn"):
        return torchvision.models.vgg19_bn().to(device="cuda")
    # segmentation models
    elif (net == "deeplabv3_resnet50"):
        return torchvision.models.segmentation.deeplabv3_resnet50().to(device="cuda")
    elif (net == "deeplabv3_resnet101"):
        return torchvision.models.segmentation.deeplabv3_resnet101().to(device="cuda")
elif (net == "fcn_resnet50"): return torchvision.models.segmentation.deeplabv3_resnet50().to(device="cuda") elif (net == "fcn_resnet101"): return torchvision.models.segmentation.deeplabv3_resnet101().to(device="cuda") else: print ("ERROR: not a supported model.") sys.exit(1) def forwardbackward(inp, optimizer, network, target): optimizer.zero_grad() out = network(inp) # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work loss = torch.nn.functional.cross_entropy(out, target) loss.backward() optimizer.step() def rendezvous(distributed_parameters): print("Initializing process group...") torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size']) print("Rendezvous complete. Created process group...") def run_benchmarking_wrapper(net, batch_size, iterations, run_fp16, dataparallel, distributed_dataparallel, device_ids=None, distributed_parameters=None): if (dataparallel or distributed_dataparallel): ngpus = len(device_ids) if device_ids else torch.cuda.device_count() else: ngpus = 1 if (distributed_dataparallel): # Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified distributed_parameters['world_size'] = ngpus * distributed_parameters['world_size'] distributed_parameters['rank'] = ngpus * distributed_parameters['rank'] mp.spawn(run_benchmarking, nprocs=ngpus, args=(ngpus, net, batch_size, iterations, run_fp16, dataparallel, distributed_dataparallel, device_ids, distributed_parameters)) else: run_benchmarking(0, ngpus, net, batch_size, iterations, run_fp16, dataparallel, distributed_dataparallel, device_ids=None, distributed_parameters=None) def run_benchmarking(local_rank, ngpus, net, batch_size, iterations, run_fp16, dataparallel, distributed_dataparallel, device_ids=None, distributed_parameters=None): if device_ids: assert ngpus == len(device_ids) torch.cuda.set_device("cuda:%d" % device_ids[local_rank]) else: torch.cuda.set_device("cuda:0") network = get_network(net) if (run_fp16): network = network_to_half(network) if (dataparallel): devices_to_run_on = device_ids if device_ids else list(range(ngpus)) print ("INFO: Running dataparallel on devices: {}".format(str(devices_to_run_on))) network = torch.nn.DataParallel(network, device_ids=devices_to_run_on) elif (distributed_dataparallel): distributed_parameters['rank'] += local_rank rendezvous(distributed_parameters) devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) batch_size = int(batch_size / ngpus) if (net == "inception_v3"): inp = torch.randn(batch_size, 3, 299, 299, device="cuda") else: inp = torch.randn(batch_size, 3, 224, 224, device="cuda") if (run_fp16): inp = inp.half() target = torch.arange(batch_size, device="cuda") param_copy = network.parameters() if (run_fp16): param_copy = get_param_copy(network) optimizer = torch.optim.SGD(param_copy, lr = 0.01, momentum = 0.9) ## warmup. print ("INFO: running forward and backward for warmup.") forwardbackward(inp, optimizer, network, target) forwardbackward(inp, optimizer, network, target) time.sleep(1) torch.cuda.synchronize() ## benchmark. 
print ("INFO: running the benchmark..") tm = time.time() for i in range(iterations): forwardbackward(inp, optimizer, network, target) torch.cuda.synchronize() tm2 = time.time() time_per_batch = (tm2 - tm) / iterations print ("OK: finished running benchmark..") print ("--------------------SUMMARY--------------------------") print ("Microbenchmark for network : {}".format(net)) if (distributed_dataparallel): print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); print ("Num devices: 1") else: print ("Num devices: {}".format(ngpus)) print ("Mini batch size [img] : {}".format(batch_size)) print ("Time per mini-batch : {}".format(time_per_batch)) print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) if (distributed_dataparallel): print ("") print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") world_size = distributed_parameters['world_size'] print ("Num devices: {}".format(world_size)) print ("Mini batch size [img] : {}".format(batch_size*world_size)) print ("Time per mini-batch : {}".format(time_per_batch)) print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) def main(): net = args.network batch_size = args.batch_size iterations = args.iterations run_fp16 = args.fp16 dataparallel = args.dataparallel distributed_dataparallel = args.distributed_dataparallel device_ids_str = args.device_ids if (args.device_ids): device_ids = [int(x) for x in device_ids_str.split(",")] else: device_ids = None distributed_parameters = {} distributed_parameters['rank'] = args.rank distributed_parameters['world_size'] = args.world_size distributed_parameters['dist_backend'] = args.dist_backend distributed_parameters['dist_url'] = args.dist_url # Some arguments are required for distributed_dataparallel if distributed_dataparallel: assert args.rank is not None and \ args.world_size is not None and \ args.dist_backend is not None and \ args.dist_url is not None, "rank, world-size, dist-backend and dist-url are required arguments for distributed_dataparallel" run_benchmarking_wrapper(net, batch_size, iterations, run_fp16, dataparallel, distributed_dataparallel, device_ids, distributed_parameters) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--network", type=str, choices=['alexnet', 'vgg11', 'vgg13', 'vgg16', 'vgg19', 'vgg11_bn', 'vgg13_bn', 'vgg16_bn', 'vgg19_bn', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'shufflenet', 'shufflenet_v2_x05', 'shufflenet_v2_x10', 'shufflenet_v2_x15', 'SqueezeNet', 'SqueezeNet1.1', 'densenet121', 'densenet169', 'densenet201', 'densenet161', 'inception', 'inception_v3', 'resnext50', 'resnext101', 'mobilenet_v2', 'googlenet' , 'deeplabv3_resnet50', 'deeplabv3_resnet101', 'fcn_resnet50', 'fcn_resnet101' ], required=True, help="Network to run.") parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)") parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations") parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") parser.add_argument("--dataparallel", action='store_true', required=False, help="Use torch.nn.DataParallel api to run single process on multiple devices. 
    parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually; this script will only launch ONE process per invocation. Use only one of --dataparallel or --distributed_dataparallel")
    parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) specifying which HIP devices (0-indexed) to run the dataparallel or distributed_dataparallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visibility of devices to different processes.")
    parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel")
    parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel")
    parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel")
    parser.add_argument("--dist-url", type=str, required=False, default=None, help="URL used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0, e.g. 'tcp://172.23.2.1:54321'. Required for --distributed_dataparallel")
    args = parser.parse_args()

    main()
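
# Illustrative invocations, kept here as a reference sketch. The script filename,
# device IDs, IP address and port are placeholders/assumptions, not values defined
# by this file; adjust them to your environment.
#
#   Single GPU, FP32:
#     python <this_script>.py --network resnet50 --batch-size 64 --iterations 20
#
#   Single process on multiple GPUs via torch.nn.DataParallel:
#     python <this_script>.py --network resnet50 --dataparallel --device_ids 0,1
#
#   Two-node DistributedDataParallel (launch one process per node manually):
#     # on the rank-0 node:
#     python <this_script>.py --network resnet50 --distributed_dataparallel \
#         --rank 0 --world-size 2 --dist-backend nccl --dist-url tcp://172.23.2.1:54321
#     # on the rank-1 node:
#     python <this_script>.py --network resnet50 --distributed_dataparallel \
#         --rank 1 --world-size 2 --dist-backend nccl --dist-url tcp://172.23.2.1:54321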