import gc
import time
import ctypes
import math

##VERY IMPORTANT: comfy_aimfo.control.init() must be called before torch is imported or anything that
#imports torch (including comfy_aimdo.torch(

import comfy_aimdo.control
comfy_aimdo.control.init()
comfy_aimdo.control.set_log_info()
#comfy_aimdo.control.set_log_debug() #use this to see much more information
#comfy_aimdo.control.set_log_verbose() #use this to see even more information (there is also vverbose)

import torch
import comfy_aimdo.torch
from comfy_aimdo.model_vbar import ModelVBAR, vbar_fault, vbar_unpin, vbar_signature_compare

comfy_aimdo.control.init_device(torch.device(torch.cuda.current_device()).index)

signatures = {}

M = (1024 ** 2)

def run_layer(input_tensor, weight, cpu_source, weight_offset): #NOTE: offset just for prints
    vbar, ptr, size = weight
    signature = vbar_fault(weight)
    if signature is not None:
        weight_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight, torch.device("cuda:0")).view(dtype=input_tensor.dtype).view(input_tensor.shape)
        if not vbar_signature_compare(signature, signatures.get(weight, None)):
            weight_tensor.copy_(cpu_source)
            if weight_offset is not None:
                print(f"[First Load] Populated weight at offset: {weight_offset / M}M")
        elif weight_offset is not None:
            print(f"[No Load Needed] Reusing weight at offset: {weight_offset / M}M")
        w = weight_tensor
        signatures[weight] = signature
    else:
        weight = None
        if weight_offset is not None:
            print(f"[Offloaded] offset: {weight_offset / M}M")
        w = cpu_source.to("cuda:0", non_blocking=True)
        
    #Layer math here
    output = input_tensor + w

    if weight is not None:
        vbar_unpin(weight)

    return output

def run_model(weights, cpu_weight, sleep=0):
    x = torch.zeros(cpu_weight.shape, device="cuda:0", dtype=torch.float16)
    for i in range(6): # Iteration loop
        print(f"\nIteration {i}")
        weight_offset = 0 #just for print messages
        if (i > 2):
            print("...")
            weight_offset = None

        for layer_weight in weights:
            x = run_layer(x, layer_weight, cpu_weight, weight_offset)
            if weight_offset is not None:
                weight_offset += cpu_weight.numel() * cpu_weight.element_size()
        time.sleep(sleep) #so you can see nvtop
    gc.collect()
    torch.cuda.empty_cache()

gpu_size = torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory
dtype = torch.float16

#A big model, with 30 weights filling 1.5X the available VRAM
num_layers = 30
scale_factor = gpu_size * 3 // (2 *num_layers * (1024 * 1024) * dtype.itemsize)
vbar1 = ModelVBAR(gpu_size * 5, device=0) #The vbar can be much bigger than VRAM
shape = (1024, 1024, scale_factor)

weights1 = [vbar1.alloc(math.prod(shape) * dtype.itemsize) for _ in range(num_layers)]
#just share one weight in this example, as don't complicate this example
#with RAM usage. in the real world this will be separate weights for every layer
cpu_weight1 = torch.ones(shape, dtype=dtype)

print("##################### Run the first model #######################")
print("Some weights will be loaded and stay there for all iterations")
print("Some weights will be offloaded\n")

run_model(weights1, cpu_weight1)
comfy_aimdo.control.analyze() #print some stats

#A smaller second model but with chunkier weights
num_layers=3
vbar2 = ModelVBAR(gpu_size * 5, device=0) #The vbar can be much bigger than VRAM
shape = (1024, 1024, 2, scale_factor)

weights2 = [ vbar2.alloc(math.prod(shape) * dtype.itemsize) for _ in range(num_layers)]
cpu_weight2 = torch.ones(shape, dtype=dtype)

print("##################### Run the second model #######################")
print("Everything will be loaded and will displace some weights of the first model\n")

run_model(weights2, cpu_weight2, sleep=0.5)
comfy_aimdo.control.analyze() #print some stats

print("##################### Run the first model again #######################")
print("Some weights will still be loaded from before and be there first iteration")
print("Some weights will get re-loaded on the first interation")
print("The rest will be offloaded again\n")

vbar1.prioritize()
run_model(weights1, cpu_weight1)
comfy_aimdo.control.analyze() #print some stats