# On Titan X (Pascal)
# 8192 x 8192 matmul took: 0.10 sec, 11304.59 G ops/sec
# http://stackoverflow.com/questions/41804380/testing-gpu-with-tensorflow-matrix-multiplication
#
# On V100/fp16 peak 85k for 8192x8192
# timing under matmul_times subdirectory
#
# TODO: figure out the deal with spikes, is it CUDA autotune?

from __future__ import print_function

import ctypes
import errno
from ctypes.util import find_library
from functools import partial

# Clock ids accepted by clock_gettime(); the numeric values come from time.h.
CLOCK_PROCESS_CPUTIME_ID = 2
CLOCK_MONOTONIC_RAW = 4

# ctypes stand-ins for the C typedefs that appear in clock_gettime's prototype.
clockid_t = ctypes.c_int
time_t = ctypes.c_long


class timespec(ctypes.Structure):
    """ctypes layout of the C ``struct timespec`` that clock_gettime fills in."""

    _fields_ = [
        ("tv_sec", time_t),  # whole seconds
        ("tv_nsec", ctypes.c_long),  # nanoseconds
    ]
# Look up clock_gettime() in whatever find_library('rt') resolves to.
# use_errno=True makes ctypes keep its own errno copy, which is what
# ctypes.get_errno() reads back after a failed call.
_librt = ctypes.CDLL(find_library('rt'), use_errno=True)
_clock_gettime = _librt.clock_gettime
_clock_gettime.argtypes = [clockid_t, ctypes.POINTER(timespec)]


def clock_gettime(clk_id):
    """Return the current reading of clock `clk_id` in seconds, as a float.

    Raises:
        OSError: when the underlying C call returns a negative status; the
            exception carries the errno value and its symbolic name.
    """
    ts = timespec()
    status = _clock_gettime(clk_id, ctypes.byref(ts))
    if status >= 0:
        # Fold the (seconds, nanoseconds) pair into a single float.
        return ts.tv_sec + ts.tv_nsec * 1e-9

    code = ctypes.get_errno()
    text = errno.errorcode[code]
    if code == errno.EINVAL:
        # EINVAL gets an extra hint naming the clock id that was rejected.
        text += (" The clk_id specified is not supported on this system"
                 " clk_id=%r") % (clk_id,)
    raise OSError(code, text)

try:
    # Use the standard-library timers whenever the interpreter provides them.
    from time import perf_counter, process_time
except ImportError:  # Python <3.3
    # Otherwise build equivalents on the ctypes wrapper: each partial pins one
    # clock id and is renamed after the stdlib function it stands in for.
    process_time = partial(clock_gettime, CLOCK_PROCESS_CPUTIME_ID)
    process_time.__name__ = 'process_time'
    perf_counter = partial(clock_gettime, CLOCK_MONOTONIC_RAW)
    perf_counter.__name__ = 'perf_counter'


import math
import os
import sys
import numpy as np
import tensorflow as tf
import time

import argparse
# Command-line options. They are parsed at import time because bench() reads
# the module-level `args`.
parser = argparse.ArgumentParser(
    description='TensorFlow matmul throughput benchmark')

# `choices` makes argparse reject unsupported values up front, before any
# benchmark work starts.
parser.add_argument('--dtype', type=str, default='float32',
                    choices=['float32', 'float16'],
                    help='dtype, float32 or float16')
parser.add_argument('--agg', type=str, default='min',
                    choices=['min', 'mean', 'median'],
                    help='min, mean or median')
args = parser.parse_args()

def bench(n):
  """Time an (n x n) by (n x n) matmul on /gpu:0 and return its throughput.

  Reads the module-level `args`: `args.dtype` ('float32' or 'float16') picks
  the element type, and `args.agg` ('min', 'mean' or 'median') picks how the
  per-run timings are collapsed into a single number.

  Each call builds its own graph and closes its session on exit, so the
  matrices created for one size are not carried into the next call.

  Args:
    n: side length of the two square all-ones matrices being multiplied.

  Returns:
    Throughput in G ops/sec (1e9 operations per second).

  Raises:
    ValueError: if `args.dtype` or `args.agg` is not a supported value.
  """
  dtypes = {'float32': tf.float32, 'float16': tf.float16}
  aggregators = {'min': np.min, 'mean': np.mean, 'median': np.median}

  # Validate up front with real exceptions (asserts vanish under -O), so a
  # bad option fails before any GPU work is done.
  if args.dtype not in dtypes:
    raise ValueError('unknown dtype ' + args.dtype)
  if args.agg not in aggregators:
    raise ValueError('unknown aggregation method: ' + args.agg)
  dtype = dtypes[args.dtype]
  aggregate = aggregators[args.agg]

  # Build in a private graph. On the shared default graph every call would
  # add two more variables, and global_variables_initializer() would then
  # re-create the matrices of every previously benchmarked size as well.
  graph = tf.Graph()
  with graph.as_default():
    with tf.device("/gpu:0"):
      matrix1 = tf.Variable(tf.ones((n, n), dtype=dtype))
      matrix2 = tf.Variable(tf.ones((n, n), dtype=dtype))
      product = tf.matmul(matrix1, matrix2)
    init_op = tf.global_variables_initializer()

  iters = 11
  times = []
  config = tf.ConfigProto()
  # The context manager closes the session when the measurements are done.
  with tf.Session(graph=graph, config=config) as sess:
    sess.run(init_op)

    # pre-warming: one untimed run so one-time setup cost is not measured
    sess.run(product.op)

    for _ in range(iters):
      start = perf_counter()
      # Fetching the Operation (not the tensor) runs the matmul without
      # returning the n x n result to Python.
      sess.run(product.op)
      times.append(perf_counter() - start)

  ops = n**3 + (n-1)*n**2  # n^2*(n-1) additions, n^3 multiplications

  elapsed_sec = aggregate(np.array(times))  # perf_counter() reports seconds

  # ops / seconds / 1e9 = G ops/sec. Dividing by a millisecond figure here
  # would give a value 1000x smaller than the G ops/sec unit.
  return ops / elapsed_sec / 10**9

def main():
  """Sweep matrix sizes, printing and logging one "n,rate" line per size.

  Sizes follow n = int(2**(i/steps)) for i in range(20*steps). times.csv is
  first truncated to a single blank line, then reopened in append mode for
  every result (presumably so rows already written survive a crash partway
  through the sweep -- assumption, not stated in the code).
  """
  steps = 8  # number of steps between n doubling
  csv_path = "times.csv"

  np.set_printoptions(suppress=True)
  with open(csv_path, "w") as out:
    out.write("\n")

  for step in range(20*steps):
    n = int(math.pow(2, float(step)/steps))
    # Format once; the same text goes to stdout and to the CSV.
    row = "%d,%.10f" % (n, bench(n))
    print(row)
    with open(csv_path, "a") as out:
      out.write(row + "\n")


if __name__=='__main__':
  main()