"""Measure matmul throughput and copy bandwidth of a PyTorch device.

Run as a script; see ``--help`` for the available options.
"""
import argparse
import time

import numpy as np
import torch

# Intel Extension for PyTorch is optional.
try:
    import intel_extension_for_pytorch as ipex  # noqa: F401
except ImportError:
    ipex = None

# Import ``torch.xpu`` independently of IPEX so that a missing IPEX install
# does not leave the ``xpu`` name undefined.
try:
    from torch import xpu
except ImportError:
    xpu = None

from torch import mps, cuda

parser = argparse.ArgumentParser(description='Measure FLOPs and BW.')
parser.add_argument('--device', type=str, default='cpu',
                    help='One of cpu | cuda | mps | xpu')
parser.add_argument('--num_trails', type=int, default=10,
                    help='Number of trails to get average.')
parser.add_argument('--dtype', type=str, default='float',
                    help='One of float32|float64|float16|bfloat16|int8|int16|int32|bool')
args = parser.parse_args()

# Reject names that are missing from ``torch`` or that are not dtypes
# (e.g. ``--dtype nn``) with a normal argparse usage error.
dtype = getattr(torch, args.dtype, None)
if not isinstance(dtype, torch.dtype):
    parser.error(f'--dtype: {args.dtype!r} is not a torch dtype')

device = torch.device(args.device)

# The averages below divide by this value, so it must be at least 1.
num_trails = args.num_trails
if num_trails < 1:
    parser.error('--num_trails must be a positive integer')


def _random_square(n, device):
    """Return an ``n`` x ``n`` tensor on ``device`` converted to ``dtype``.

    Values are drawn with ``torch.rand`` and scaled by 10 before the
    conversion to the module-level ``dtype``.
    """
    return (10 * torch.rand(n, n, device=device)).to(dtype)


def flops_benchmark(device):
    """Time square matrix multiplications on ``device`` and print the results.

    For each size in ``2 ** np.arange(8, 13, 0.25)`` (truncated to int), the
    matmul is timed ``num_trails`` times and one line is printed with the
    size, the mean elapsed seconds, and ``2 * n**3 / seconds / 1e12``.

    Args:
        device: ``torch.device`` on which the tensors are created.
    """
    test_range = 2 ** np.arange(8, 13, 0.25)
    print(f'benchmarking {device} using {dtype}')
    print('size, elapsed_time, tops')
    for n in test_range:
        n = int(n)

        # One untimed matmul per size, so first-use setup cost is not
        # attributed to the measured trials.
        warm = _random_square(n, device)
        torch.matmul(warm, warm)
        synchronize(device)
        del warm

        total = 0.0
        for _ in range(num_trails):
            a = _random_square(n, device)
            # Make sure tensor creation has finished before the clock starts.
            synchronize(device)
            start = time.perf_counter()
            b = torch.matmul(a, a)
            # Wait for the matmul to finish before the clock stops.
            synchronize(device)
            total += time.perf_counter() - start
        total = total / num_trails
        tflops = 2 * n**3 / total / 1e12
        print(n, total, tflops, sep=", ")


def synchronize(device):
    """Block until pending work on ``device`` has completed.

    CUDA and XPU are synchronized on the given device rather than the
    current one, so an explicit index such as ``cuda:1`` is honoured.
    Device types other than cuda, mps and xpu need no action here.

    Args:
        device: ``torch.device`` to synchronize.

    Raises:
        RuntimeError: if ``device`` is an xpu device but ``torch.xpu`` could
            not be imported.
    """
    kind = device.type
    if kind == "cuda":
        cuda.synchronize(device)
    elif kind == "mps":
        mps.synchronize()
    elif kind == "xpu":
        if xpu is None:
            raise RuntimeError(
                "xpu device requested but torch.xpu is not available")
        xpu.synchronize(device)


def memory_bandwidth_benchmark(device):
    """Time tensor-to-tensor copies on ``device`` and print the results.

    For each element count in ``2 ** np.arange(20, 27, 0.5)`` (truncated to
    int), a ``copy_`` between two ``torch.rand`` tensors is timed
    ``num_trails`` times. One line is printed per size with the tensor size
    in GB, the mean elapsed seconds, and the bandwidth in GB/s, where each
    copy counts the tensor's bytes twice (one read plus one write).

    Args:
        device: ``torch.device`` on which the tensors are created.

    Returns:
        The bandwidth in GB/s computed for the last (largest) size.
    """
    test_range = 2 ** (np.arange(20, 27, 0.5))
    print('size (GB), elapsed_time, bandwidth (GB/s)')
    for size in test_range:
        size = int(size)
        elapsed_time = 0.0
        for _ in range(num_trails):
            # Create random source and destination tensors.
            a = torch.rand(size, device=device)
            b = torch.rand(size, device=device)

            # Untimed warm-up copy.
            synchronize(device)
            a.copy_(b)
            synchronize(device)

            # Timed copy; synchronize so the copy has finished before the
            # clock stops.
            start_time = time.perf_counter()
            a.copy_(b)
            synchronize(device)
            elapsed_time += time.perf_counter() - start_time

        elapsed_time = elapsed_time / num_trails

        bytes_copied = a.nelement() * a.element_size()  # bytes
        bandwidth = 2 * bytes_copied / elapsed_time / 1e9  # GB/s

        print(bytes_copied / 1e9, elapsed_time, bandwidth, sep=', ')

    return bandwidth


if __name__ == "__main__":
    flops_benchmark(device)
    memory_bandwidth_benchmark(device)