{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zJvyUmZdktu0", "outputId": "25275dd8-4787-4b77-f7bf-4f2bbe66f0b8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspace/axolotl\n" ] } ], "source": [ "%cd /workspace/axolotl" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mC48y25Lkqa5", "outputId": "2757a3c8-3790-4fd3-be39-03be8f533b35" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\n", "accelerate configuration saved at /root/.cache/huggingface/accelerate/default_config.yaml\n" ] } ], "source": [ "!accelerate config --config_file configs/accelerate/default_config.yaml default" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "--s-HPfqUwH9", "outputId": "c524f89e-bafc-486e-b760-ff4697c16da6" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "base_model: tiiuae/falcon-rw-1b\n", "base_model_config: tiiuae/falcon-rw-1b\n", "trust_remote_code: true\n", "model_type: AutoModelForCausalLM\n", "tokenizer_type: AutoTokenizer\n", "load_in_8bit: true\n", "load_in_4bit: false\n", "gptq: false\n", "strict: false\n", "push_dataset_to_hub:\n", "datasets:\n", " - path: teknium/GPT4-LLM-Cleaned\n", " type: alpaca:chat\n", "dataset_prepared_path: last_run_prepared\n", "val_set_size: 0.01\n", "adapter: lora\n", "lora_model_dir:\n", "sequence_len: 2048\n", "max_packed_sequence_len:\n", "lora_r: 16\n", "lora_alpha: 32\n", "lora_dropout: 0.0\n", "lora_target_modules:\n", "lora_target_linear: true\n", "lora_fan_in_fan_out:\n", "wandb_project: falcon-rw-1b\n", "wandb_watch:\n", "wandb_run_id:\n", "wandb_log_model:\n", "output_dir: ./falcon-rw-1b\n", "batch_size: 2\n", "micro_batch_size: 1\n", "num_epochs: 4\n", "optimizer: adamw_bnb_8bit\n", "torchdistx_path:\n", "lr_scheduler: cosine\n", "learning_rate: 0.00003\n", "train_on_inputs: false\n", "group_by_length: false\n", "bf16: true\n", "fp16: false\n", "tf32: true\n", "gradient_checkpointing: true\n", "early_stopping_patience:\n", "resume_from_checkpoint:\n", "local_rank:\n", "logging_steps: 1\n", "xformers_attention: true\n", "flash_attention:\n", "gptq_groupsize:\n", "gptq_model_v1:\n", "warmup_steps: 40\n", "eval_steps: 5\n", "save_steps: 43\n", "debug:\n", "deepspeed:\n", "weight_decay: 0.0\n", "fsdp:\n", "fsdp_config:\n", "special_tokens:\n", " pad_token: \"<|endoftext|>\"\n", " bos_token: \">>ABSTRACT<<\"\n", " eos_token: \"<|endoftext|>\"" ] } ], "source": [ "!cat examples/falcon/config-1b-lora.yml" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "jICMPJuomFsx" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\n", "\n", "===================================BUG REPORT===================================\n", "Welcome to bitsandbytes. For bug reports, please run\n", "\n", "python -m bitsandbytes\n", "\n", " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", "================================================================================\n", "bin /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda118.so\n", "False\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib'), PosixPath('/usr/local/nvidia/lib64')}\n", " warn(msg)\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n", " warn(msg)\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n", " warn(msg)\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIK1tFOFrWbmoa2ckCJYhzgBHKTSMeR/AeuScCCzugqlI utensilcandel@gmail.com')}\n", " warn(msg)\n", "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/usr/local/cuda/lib64/libcudart.so'), PosixPath('/usr/local/cuda/lib64/libcudart.so.11.0')}.. We'll flip a coin and try one of these, in order to fail forward.\n", "Either way, this might cause trouble in the future:\n", "If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.\n", " warn(msg)\n", "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n", "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n", "CUDA SETUP: Detected CUDA version 118\n", "CUDA SETUP: Required library version not found: libbitsandbytes_cuda118.so. Maybe you need to compile it from source?\n", "CUDA SETUP: Defaulting to libbitsandbytes_cpu.so...\n", "\n", "================================================ERROR=====================================\n", "CUDA SETUP: CUDA detection failed! Possible reasons:\n", "1. CUDA driver not installed\n", "2. CUDA not installed\n", "3. You have multiple conflicting CUDA libraries\n", "4. Required library not pre-compiled for this bitsandbytes release!\n", "CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.\n", "CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`.\n", "================================================================================\n", "\n", "CUDA SETUP: Something unexpected happened. Please compile from source:\n", "git clone git@github.com:TimDettmers/bitsandbytes.git\n", "cd bitsandbytes\n", "CUDA_VERSION=118 make cuda11x\n", "python setup.py install\n", "CUDA SETUP: Setup Failed!\n", "Traceback (most recent call last):\n", " File \"/workspace/axolotl/scripts/finetune.py\", line 24, in \n", " from axolotl.utils.models import load_model, load_tokenizer\n", " File \"/workspace/axolotl/src/axolotl/utils/models.py\", line 7, in \n", " import bitsandbytes as bnb\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/__init__.py\", line 6, in \n", " from . import cuda_setup, utils, research\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/research/__init__.py\", line 1, in \n", " from . import nn\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/research/nn/__init__.py\", line 1, in \n", " from .modules import LinearFP8Mixed, LinearFP8Global\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/research/nn/modules.py\", line 8, in \n", " from bitsandbytes.optim import GlobalOptimManager\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/optim/__init__.py\", line 6, in \n", " from bitsandbytes.cextension import COMPILED_WITH_CUDA\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cextension.py\", line 20, in \n", " raise RuntimeError('''\n", "RuntimeError: \n", " CUDA Setup failed despite GPU being available. Please run the following command to get more information:\n", "\n", " python -m bitsandbytes\n", "\n", " Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them\n", " to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes\n", " and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues\n", "Traceback (most recent call last):\n", " File \"/root/miniconda3/envs/py3.9/bin/accelerate\", line 8, in \n", " sys.exit(main())\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py\", line 45, in main\n", " args.func(args)\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/accelerate/commands/launch.py\", line 928, in launch_command\n", " simple_launcher(args)\n", " File \"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/accelerate/commands/launch.py\", line 588, in simple_launcher\n", " raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)\n", "subprocess.CalledProcessError: Command '['/root/miniconda3/envs/py3.9/bin/python3', 'scripts/finetune.py', 'examples/falcon/config-1b-lora.yml']' returned non-zero exit status 1.\n" ] } ], "source": [ "!accelerate launch scripts/finetune.py examples/falcon/config-1b-lora.yml" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "0S87GQqWnzzc" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspace\n" ] } ], "source": [ "%cd /workspace" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cloning into 'bitsandbytes'...\n", "remote: Enumerating objects: 5251, done.\u001b[K\n", "remote: Counting objects: 100% (5250/5250), done.\u001b[K\n", "remote: Compressing objects: 100% (1517/1517), done.\u001b[K\n", "remote: Total 5251 (delta 3717), reused 5169 (delta 3684), pack-reused 1\u001b[K\n", "Receiving objects: 100% (5251/5251), 1.48 MiB | 7.54 MiB/s, done.\n", "Resolving deltas: 100% (3717/3717), done.\n" ] } ], "source": [ "!git clone https://github.com/TimDettmers/bitsandbytes" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspace/bitsandbytes\n" ] } ], "source": [ "%cd bitsandbytes" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mkdir -p build\n", "mkdir -p dependencies\n", "ENVIRONMENT\n", "============================\n", "CUDA_VERSION: 118\n", "============================\n", "NVCC path: /usr/local/cuda/bin/nvcc\n", "GPP path: /usr/bin/g++ VERSION: g++ (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0\n", "CUDA_HOME: /usr/local/cuda\n", "CONDA_PREFIX: \n", "PATH: /root/miniconda3/envs/py3.9/bin:/root/miniconda3/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\n", "LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n", "============================\n", "/usr/local/cuda/bin/nvcc -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc /workspace/bitsandbytes/csrc/ops.cu /workspace/bitsandbytes/csrc/kernels.cu -I /usr/local/cuda/include -I /workspace/bitsandbytes/csrc -I /include -I /workspace/bitsandbytes/include -L /usr/local/cuda/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L /lib --output-directory /workspace/bitsandbytes/build\n", "ptxas info : 15 bytes gmem\n", "ptxas info : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_75'\n", "ptxas info : Function properties for _ZN3cub11EmptyKernelIvEEvv\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 4 registers, 352 bytes cmem[0]\n", "ptxas info : 15 bytes gmem\n", "ptxas info : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_80'\n", "ptxas info : Function properties for _ZN3cub11EmptyKernelIvEEvv\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 4 registers, 352 bytes cmem[0]\n", "ptxas info : 15 bytes gmem\n", "ptxas info : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_86'\n", "ptxas info : Function properties for _ZN3cub11EmptyKernelIvEEvv\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 4 registers, 352 bytes cmem[0]\n", "ptxas warning : Value of threads per SM for entry _Z9kQuantizePfS_Phi is out of range. .minnctapersm will be ignored\n", "ptxas info : 31 bytes gmem\n", "ptxas info : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_75'\n", "ptxas info : Function properties for _ZN3cub11EmptyKernelIvEEvv\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 4 registers, 352 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi4ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi4ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi4ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi4ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI13__nv_bfloat16Li5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI13__nv_bfloat16Li5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi5ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi5ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi2ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi2ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi2ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi2ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi1ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi1ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi1ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi1ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 74 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseI13__nv_bfloat16Li0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseI13__nv_bfloat16Li0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseI6__halfLi0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseI6__halfLi0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseIfLi0ELi2048ELi8EEvPT_S1_PhS2_fffifPfS3_S3_S3_ffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseIfLi0ELi2048ELi8EEvPT_S1_PhS2_fffifPfS3_S3_S3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi2EEvPfPhS0_PT_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi2EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi2EEvPfPhS1_PT_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi2EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi0EEvPfPhS0_PT_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi0EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi0EEvPfPhS1_PT_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi0EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 59 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi1EEvPfPhS0_PT_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi1EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi1EEvPfPhS1_PT_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi1EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 29 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 29 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 38 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 45 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 45 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 45 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 51 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 51 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi1ELi0EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi1ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 57 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 51 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 44 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 44 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 44 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 50 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 50 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi1ELi0EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi1ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 59 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 50 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z19kPercentileClippingI6__halfLi2048ELi4EEvPT_Pfii' for 'sm_75'\n", "ptxas info : Function properties for _Z19kPercentileClippingI6__halfLi2048ELi4EEvPT_Pfii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 35 registers, 376 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z19kPercentileClippingIfLi2048ELi4EEvPT_Pfii' for 'sm_75'\n", "ptxas info : Function properties for _Z19kPercentileClippingIfLi2048ELi4EEvPT_Pfii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 35 registers, 376 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PKffffffifPfS5_S5_S5_S5_S5_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PKffffffifPfS5_S5_S5_S5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 484 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PKffffffifPfS6_S6_S6_S6_S6_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PKffffffifPfS6_S6_S6_S6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 484 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PffffiS3_S3_S3_S3_S3_S3_fi' for 'sm_75'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PffffiS3_S3_S3_S3_S3_S3_fi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 115 registers, 464 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PffffiS4_S4_S4_S4_S4_S4_fi' for 'sm_75'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PffffiS4_S4_S4_S4_S4_S4_fi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 115 registers, 464 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_75'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateI13__nv_bfloat16Li0EEvPT_S2_PfS3_S3_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateI13__nv_bfloat16Li0EEvPT_S2_PfS3_S3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateI6__halfLi0EEvPT_S2_PfS3_S3_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateI6__halfLi0EEvPT_S2_PfS3_S3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateIfLi0EEvPT_S1_PfS2_S2_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateIfLi0EEvPT_S1_PfS2_S2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateI13__nv_bfloat16Li0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateI13__nv_bfloat16Li0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 57 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateI6__halfLi0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateI6__halfLi0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 57 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateIfLi0ELi4096ELi8EEvPT_S1_PfS2_S2_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateIfLi0ELi4096ELi8EEvPT_S1_PfS2_S2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 55 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi4EEvPT_S1_PfS2_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi4EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi4EEvPT_S2_PfS3_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi4EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI13__nv_bfloat16Li5EEvPT_S2_PfS3_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI13__nv_bfloat16Li5EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 49 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi5EEvPT_S1_PfS2_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi5EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 49 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi5EEvPT_S2_PfS3_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi5EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 50 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi2EEvPT_S1_PfS2_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi2EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 49 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi2EEvPT_S2_PfS3_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi2EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 50 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi1EEvPT_S1_PfS2_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi1EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi1EEvPT_S2_PfS3_ffffffiffbi' for 'sm_75'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi1EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 50 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi4ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi4ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi4ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi4ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 53 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI13__nv_bfloat16Li5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI13__nv_bfloat16Li5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi5ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi5ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 53 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi2ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi2ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi2ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi2ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 53 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi1ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi1ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi1ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_75'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi1ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 43 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kEstimateQuantilesI6__halfEvPT_PffS1_i' for 'sm_75'\n", "ptxas info : Function properties for _Z18kEstimateQuantilesI6__halfEvPT_PffS1_i\n", " 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 82 registers, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kEstimateQuantilesIfEvPT_PffS0_i' for 'sm_75'\n", "ptxas info : Function properties for _Z18kEstimateQuantilesIfEvPT_PffS0_i\n", " 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 84 registers, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi1EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi1EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 38 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi0EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii' for 'sm_75'\n", "ptxas info : Function properties for _Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi0EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z22kdequant_mm_int32_fp16ILi4ELi128ELi512EEvPiPfS1_P6__halfS1_S1_S3_iiii' for 'sm_75'\n", "ptxas info : Function properties for _Z22kdequant_mm_int32_fp16ILi4ELi128ELi512EEvPiPfS1_P6__halfS1_S1_S3_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 42 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi4EEvPcS0_iiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi4EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi4EEvPcS0_iiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi4EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 42 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi3EEvPcS0_iiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi3EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi3EEvPcS0_iiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi3EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 43 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi2EEvPcS0_iiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi2EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 35 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi2EEvPcS0_iiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi2EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi32ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi32ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi16ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi16ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi8ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi8ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi32ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi32ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi16ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi16ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi8ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi8ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z16kExtractOutliersILi4EEvPcPiS0_iiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z16kExtractOutliersILi4EEvPcPiS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 14 registers, 396 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z16kExtractOutliersILi3EEvPcPiS0_iiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z16kExtractOutliersILi3EEvPcPiS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 14 registers, 396 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kgemm_4bit_inferenceI6__halfLi160EEviiiPT_PhPfS2_iiii' for 'sm_75'\n", "ptxas info : Function properties for _Z20kgemm_4bit_inferenceI6__halfLi160EEviiiPT_PhPfS2_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 416 bytes cmem[0], 48 bytes cmem[2]\n", "ptxas info : Compiling entry function '_Z20kgemm_4bit_inferenceI6__halfLi128EEviiiPT_PhPfS2_iiii' for 'sm_75'\n", "ptxas info : Function properties for _Z20kgemm_4bit_inferenceI6__halfLi128EEviiiPT_PhPfS2_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 416 bytes cmem[0], 48 bytes cmem[2]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi96EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi96EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi64EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi64EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi32EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi32EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi128EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi128EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi160EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi160EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi192EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi192EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi256EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi256EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi96EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi96EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi64EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi64EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi32EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi32EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi128EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi128EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi160EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi160EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi192EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi192EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi256EEviiiPT_S2_S2_iii' for 'sm_75'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi256EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi2EEvPT_S1_S0_l' for 'sm_75'\n", "ptxas info : Function properties for _Z5kfuncIfLi2EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi1EEvPT_S1_S0_l' for 'sm_75'\n", "ptxas info : Function properties for _Z5kfuncIfLi1EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIhLi0EEvPT_S1_S0_l' for 'sm_75'\n", "ptxas info : Function properties for _Z5kfuncIhLi0EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi0EEvPT_S1_S0_l' for 'sm_75'\n", "ptxas info : Function properties for _Z5kfuncIfLi0EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi1EEvPT_PfS3_Pifiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi1EEvPT_PfS3_Pifiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi0EEvPT_PfS3_Pifiiii' for 'sm_75'\n", "ptxas info : Function properties for _Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi0EEvPT_PfS3_Pifiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11kDequantizePfPhS_i' for 'sm_75'\n", "ptxas info : Function properties for _Z11kDequantizePfPhS_i\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 12 registers, 1024 bytes smem, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z9kQuantizePfS_Phi' for 'sm_75'\n", "ptxas info : Function properties for _Z9kQuantizePfS_Phi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 52 registers, 21520 bytes smem, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z22kHistogramScatterAdd2DPfPiS0_S_ii' for 'sm_75'\n", "ptxas info : Function properties for _Z22kHistogramScatterAdd2DPfPiS0_S_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 14 registers, 392 bytes cmem[0]\n", "ptxas info : Function properties for _Z9dQuantizeILi1EEhPfff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z9dQuantizeILi0EEhPfff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z12dQuantizeNF4f\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z14dDequantizeNF4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z15dhDequantizeNF4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z12dQuantizeFP4f\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z18dDequantizeFP4Treehf\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z15d2DequantizeFP4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z14dDequantizeFP4hf\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z9atomicMinPff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z9atomicMaxPff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas warning : Value of threads per SM for entry _Z9kQuantizePfS_Phi is out of range. .minnctapersm will be ignored\n", "ptxas info : 31 bytes gmem\n", "ptxas info : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_80'\n", "ptxas info : Function properties for _ZN3cub11EmptyKernelIvEEvv\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 4 registers, 352 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi4ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi4ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi4ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi4ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI13__nv_bfloat16Li5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI13__nv_bfloat16Li5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi5ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi5ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi2ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi2ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi2ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi2ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi1ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi1ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi1ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi1ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 74 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseI13__nv_bfloat16Li0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseI13__nv_bfloat16Li0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseI6__halfLi0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseI6__halfLi0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseIfLi0ELi2048ELi8EEvPT_S1_PhS2_fffifPfS3_S3_S3_ffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseIfLi0ELi2048ELi8EEvPT_S1_PhS2_fffifPfS3_S3_S3_ffbi\n", " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi2EEvPfPhS0_PT_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi2EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 56 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi2EEvPfPhS1_PT_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi2EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 56 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi0EEvPfPhS0_PT_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi0EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi0EEvPfPhS1_PT_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi0EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi1EEvPfPhS0_PT_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi1EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi1EEvPfPhS1_PT_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi1EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi1ELi0EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi1ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi1ELi0EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi1ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z19kPercentileClippingI6__halfLi2048ELi4EEvPT_Pfii' for 'sm_80'\n", "ptxas info : Function properties for _Z19kPercentileClippingI6__halfLi2048ELi4EEvPT_Pfii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 376 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z19kPercentileClippingIfLi2048ELi4EEvPT_Pfii' for 'sm_80'\n", "ptxas info : Function properties for _Z19kPercentileClippingIfLi2048ELi4EEvPT_Pfii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 376 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PKffffffifPfS5_S5_S5_S5_S5_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PKffffffifPfS5_S5_S5_S5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 484 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PKffffffifPfS6_S6_S6_S6_S6_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PKffffffifPfS6_S6_S6_S6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 484 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PffffiS3_S3_S3_S3_S3_S3_fi' for 'sm_80'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PffffiS3_S3_S3_S3_S3_S3_fi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 115 registers, 464 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PffffiS4_S4_S4_S4_S4_S4_fi' for 'sm_80'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PffffiS4_S4_S4_S4_S4_S4_fi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 120 registers, 464 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_80'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateI13__nv_bfloat16Li0EEvPT_S2_PfS3_S3_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateI13__nv_bfloat16Li0EEvPT_S2_PfS3_S3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateI6__halfLi0EEvPT_S2_PfS3_S3_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateI6__halfLi0EEvPT_S2_PfS3_S3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateIfLi0EEvPT_S1_PfS2_S2_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateIfLi0EEvPT_S1_PfS2_S2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateI13__nv_bfloat16Li0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateI13__nv_bfloat16Li0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 55 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateI6__halfLi0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateI6__halfLi0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 55 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateIfLi0ELi4096ELi8EEvPT_S1_PfS2_S2_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateIfLi0ELi4096ELi8EEvPT_S1_PfS2_S2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 56 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi4EEvPT_S1_PfS2_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi4EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi4EEvPT_S2_PfS3_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi4EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI13__nv_bfloat16Li5EEvPT_S2_PfS3_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI13__nv_bfloat16Li5EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi5EEvPT_S1_PfS2_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi5EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi5EEvPT_S2_PfS3_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi5EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi2EEvPT_S1_PfS2_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi2EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi2EEvPT_S2_PfS3_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi2EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi1EEvPT_S1_PfS2_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi1EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi1EEvPT_S2_PfS3_ffffffiffbi' for 'sm_80'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi1EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi4ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi4ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi4ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi4ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI13__nv_bfloat16Li5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI13__nv_bfloat16Li5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi5ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi5ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi2ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi2ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi2ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi2ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi1ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi1ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi1ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_80'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi1ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kEstimateQuantilesI6__halfEvPT_PffS1_i' for 'sm_80'\n", "ptxas info : Function properties for _Z18kEstimateQuantilesI6__halfEvPT_PffS1_i\n", " 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 82 registers, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kEstimateQuantilesIfEvPT_PffS0_i' for 'sm_80'\n", "ptxas info : Function properties for _Z18kEstimateQuantilesIfEvPT_PffS0_i\n", " 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 83 registers, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi1EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi1EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi0EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii' for 'sm_80'\n", "ptxas info : Function properties for _Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi0EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z22kdequant_mm_int32_fp16ILi4ELi128ELi512EEvPiPfS1_P6__halfS1_S1_S3_iiii' for 'sm_80'\n", "ptxas info : Function properties for _Z22kdequant_mm_int32_fp16ILi4ELi128ELi512EEvPiPfS1_P6__halfS1_S1_S3_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi4EEvPcS0_iiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi4EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi4EEvPcS0_iiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi4EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi3EEvPcS0_iiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi3EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi3EEvPcS0_iiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi3EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi2EEvPcS0_iiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi2EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi2EEvPcS0_iiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi2EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi32ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi32ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi16ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi16ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi8ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi8ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi32ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi32ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi16ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi16ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi8ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi8ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 32 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z16kExtractOutliersILi4EEvPcPiS0_iiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z16kExtractOutliersILi4EEvPcPiS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 14 registers, 396 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z16kExtractOutliersILi3EEvPcPiS0_iiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z16kExtractOutliersILi3EEvPcPiS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 13 registers, 396 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kgemm_4bit_inferenceI6__halfLi160EEviiiPT_PhPfS2_iiii' for 'sm_80'\n", "ptxas info : Function properties for _Z20kgemm_4bit_inferenceI6__halfLi160EEviiiPT_PhPfS2_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kgemm_4bit_inferenceI6__halfLi128EEviiiPT_PhPfS2_iiii' for 'sm_80'\n", "ptxas info : Function properties for _Z20kgemm_4bit_inferenceI6__halfLi128EEviiiPT_PhPfS2_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi96EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi96EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi64EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi64EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi32EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi32EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi128EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi128EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi160EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi160EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi192EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi192EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi256EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi256EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi96EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi96EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi64EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi64EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi32EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi32EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi128EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi128EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi160EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi160EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi192EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi192EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi256EEviiiPT_S2_S2_iii' for 'sm_80'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi256EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 167 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi2EEvPT_S1_S0_l' for 'sm_80'\n", "ptxas info : Function properties for _Z5kfuncIfLi2EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 26 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi1EEvPT_S1_S0_l' for 'sm_80'\n", "ptxas info : Function properties for _Z5kfuncIfLi1EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIhLi0EEvPT_S1_S0_l' for 'sm_80'\n", "ptxas info : Function properties for _Z5kfuncIhLi0EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi0EEvPT_S1_S0_l' for 'sm_80'\n", "ptxas info : Function properties for _Z5kfuncIfLi0EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi1EEvPT_PfS3_Pifiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi1EEvPT_PfS3_Pifiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 29 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi0EEvPT_PfS3_Pifiiii' for 'sm_80'\n", "ptxas info : Function properties for _Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi0EEvPT_PfS3_Pifiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11kDequantizePfPhS_i' for 'sm_80'\n", "ptxas info : Function properties for _Z11kDequantizePfPhS_i\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 12 registers, 1024 bytes smem, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z9kQuantizePfS_Phi' for 'sm_80'\n", "ptxas info : Function properties for _Z9kQuantizePfS_Phi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 51 registers, 21520 bytes smem, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z22kHistogramScatterAdd2DPfPiS0_S_ii' for 'sm_80'\n", "ptxas info : Function properties for _Z22kHistogramScatterAdd2DPfPiS0_S_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 14 registers, 392 bytes cmem[0]\n", "ptxas info : Function properties for _Z9dQuantizeILi1EEhPfff\n", " 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads\n", "ptxas info : Function properties for _Z9dQuantizeILi0EEhPfff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z12dQuantizeNF4f\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z14dDequantizeNF4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z15dhDequantizeNF4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z12dQuantizeFP4f\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z18dDequantizeFP4Treehf\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z15d2DequantizeFP4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z14dDequantizeFP4hf\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z9atomicMinPff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z9atomicMaxPff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas warning : Value of threads per SM for entry _Z9kQuantizePfS_Phi is out of range. .minnctapersm will be ignored\n", "ptxas info : 31 bytes gmem\n", "ptxas info : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_86'\n", "ptxas info : Function properties for _ZN3cub11EmptyKernelIvEEvv\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 4 registers, 352 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi4ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi4ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi4ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi4ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI13__nv_bfloat16Li5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI13__nv_bfloat16Li5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi5ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi5ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi5ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi2ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi2ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi2ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi2ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi1ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseI6__halfLi1ELi2048ELi8EEvPT_S2_PhfffifPfS4_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit1StateBlockwiseIfLi1ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit1StateBlockwiseIfLi1ELi2048ELi8EEvPT_S1_PhfffifPfS3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 74 registers, 432 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseI13__nv_bfloat16Li0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseI13__nv_bfloat16Li0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi\n", " 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseI6__halfLi0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseI6__halfLi0ELi2048ELi8EEvPT_S2_PhS3_fffifPfS4_S4_S4_ffbi\n", " 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z35kOptimizerStatic8bit2StateBlockwiseIfLi0ELi2048ELi8EEvPT_S1_PhS2_fffifPfS3_S3_S3_ffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z35kOptimizerStatic8bit2StateBlockwiseIfLi0ELi2048ELi8EEvPT_S1_PhS2_fffifPfS3_S3_S3_ffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 80 registers, 456 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi2EEvPfPhS0_PT_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi2EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi2EEvPfPhS1_PT_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi2EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi0EEvPfPhS0_PT_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi0EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi0EEvPfPhS1_PT_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi0EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi1EEvPfPhS0_PT_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseIfLi512ELi64ELi8ELi1EEvPfPhS0_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi1EEvPfPhS1_PT_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z20kDequantizeBlockwiseI6__halfLi512ELi64ELi8ELi1EEvPfPhS1_PT_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 392 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi2EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi2EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi1EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 27 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi1EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 34 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi64ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi128ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi256ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi512ELi2ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi1024ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi2048ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi1ELi0EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi1ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseIfLi4096ELi4ELi0ELi0EEvPfPT_S0_PhS0_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi64ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 38 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi128ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi256ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi512ELi2ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi1024ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi2048ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi1ELi0EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi1ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kQuantizeBlockwiseI6__halfLi4096ELi4ELi0ELi0EEvPfPT_S1_PhS1_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 400 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z19kPercentileClippingI6__halfLi2048ELi4EEvPT_Pfii' for 'sm_86'\n", "ptxas info : Function properties for _Z19kPercentileClippingI6__halfLi2048ELi4EEvPT_Pfii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 37 registers, 376 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z19kPercentileClippingIfLi2048ELi4EEvPT_Pfii' for 'sm_86'\n", "ptxas info : Function properties for _Z19kPercentileClippingIfLi2048ELi4EEvPT_Pfii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 39 registers, 376 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PKffffffifPfS5_S5_S5_S5_S5_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PKffffffifPfS5_S5_S5_S5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 484 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PKffffffifPfS6_S6_S6_S6_S6_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PKffffffifPfS6_S6_S6_S6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 484 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PffffiS3_S3_S3_S3_S3_S3_fi' for 'sm_86'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit2StateIfLi0EEvPT_S1_PhS2_PffffiS3_S3_S3_S3_S3_S3_fi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 115 registers, 464 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PffffiS4_S4_S4_S4_S4_S4_fi' for 'sm_86'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit2StateI6__halfLi0EEvPT_S2_PhS3_PffffiS4_S4_S4_S4_S4_S4_fi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 120 registers, 464 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 62 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPKffffffifPfS5_S5_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPKffffffifPfS5_S5_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z26kOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPKffffffifPfS6_S6_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z26kOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPKffffffifPfS6_S6_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 62 registers, 452 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi5EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi5EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi2EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi2EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPffffiS3_S3_S3_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateIfLi1EEvPT_S1_PhPffffiS3_S3_S3_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPffffiS4_S4_S4_ffi' for 'sm_86'\n", "ptxas info : Function properties for _Z38kPreconditionOptimizerStatic8bit1StateI6__halfLi1EEvPT_S2_PhPffffiS4_S4_S4_ffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateI13__nv_bfloat16Li0EEvPT_S2_PfS3_S3_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateI13__nv_bfloat16Li0EEvPT_S2_PfS3_S3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateI6__halfLi0EEvPT_S2_PfS3_S3_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateI6__halfLi0EEvPT_S2_PfS3_S3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 64 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit2StateIfLi0EEvPT_S1_PfS2_S2_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit2StateIfLi0EEvPT_S1_PfS2_S2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 63 registers, 436 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateI13__nv_bfloat16Li0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateI13__nv_bfloat16Li0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 55 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateI6__halfLi0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateI6__halfLi0ELi4096ELi8EEvPT_S2_PfS3_S3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 55 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit2StateIfLi0ELi4096ELi8EEvPT_S1_PfS2_S2_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit2StateIfLi0ELi4096ELi8EEvPT_S1_PfS2_S2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 56 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi4EEvPT_S1_PfS2_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi4EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi4EEvPT_S2_PfS3_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi4EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI13__nv_bfloat16Li5EEvPT_S2_PfS3_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI13__nv_bfloat16Li5EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi5EEvPT_S1_PfS2_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi5EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi5EEvPT_S2_PfS3_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi5EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi2EEvPT_S1_PfS2_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi2EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi2EEvPT_S2_PfS3_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi2EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateIfLi1EEvPT_S1_PfS2_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateIfLi1EEvPT_S1_PfS2_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kOptimizer32bit1StateI6__halfLi1EEvPT_S2_PfS3_ffffffiffbi' for 'sm_86'\n", "ptxas info : Function properties for _Z21kOptimizer32bit1StateI6__halfLi1EEvPT_S2_PfS3_ffffffiffbi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 428 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi4ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi4ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi4ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi4ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI13__nv_bfloat16Li5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI13__nv_bfloat16Li5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi5ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi5ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 48 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi5ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi2ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi2ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi2ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi2ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateIfLi1ELi4096ELi8EEvPT_S1_PfS2_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateIfLi1ELi4096ELi8EEvPT_S1_PfS2_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 47 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z33kPreconditionOptimizer32bit1StateI6__halfLi1ELi4096ELi8EEvPT_S2_PfS3_ffffiffi' for 'sm_86'\n", "ptxas info : Function properties for _Z33kPreconditionOptimizer32bit1StateI6__halfLi1ELi4096ELi8EEvPT_S2_PfS3_ffffiffi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 46 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kEstimateQuantilesI6__halfEvPT_PffS1_i' for 'sm_86'\n", "ptxas info : Function properties for _Z18kEstimateQuantilesI6__halfEvPT_PffS1_i\n", " 16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 82 registers, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kEstimateQuantilesIfEvPT_PffS0_i' for 'sm_86'\n", "ptxas info : Function properties for _Z18kEstimateQuantilesIfEvPT_PffS0_i\n", " 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 83 registers, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi1EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi1EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 38 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi0EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii' for 'sm_86'\n", "ptxas info : Function properties for _Z18kDoubleRowColQuantILi64ELi4ELi16ELi256ELi0EEvP6__halfPfS2_PcS3_PiS4_S1_S4_fiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 36 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z22kdequant_mm_int32_fp16ILi4ELi128ELi512EEvPiPfS1_P6__halfS1_S1_S3_iiii' for 'sm_86'\n", "ptxas info : Function properties for _Z22kdequant_mm_int32_fp16ILi4ELi128ELi512EEvPiPfS1_P6__halfS1_S1_S3_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 37 registers, 424 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi4EEvPcS0_iiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi4EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 31 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi4EEvPcS0_iiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi4EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi3EEvPcS0_iiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi3EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi3EEvPcS0_iiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi3EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi2EEvPcS0_iiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi1ELi2EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi2EEvPcS0_iiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z21kTransformRowToFormatILi256ELi8ELi32ELi256ELi0ELi2EEvPcS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 388 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi32ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi32ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi16ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi16ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveIaLi8ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveIaLi8ELi8EEvPiS0_S0_S0_S0_P6__halfPT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi32ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi32ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi16ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi16ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z27kspmm_coo_very_sparse_naiveI6__halfLi8ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z27kspmm_coo_very_sparse_naiveI6__halfLi8ELi16EEvPiS1_S1_S1_S1_PS0_PT_S2_Pfiiii\n", " 192 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 40 registers, 440 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z16kExtractOutliersILi4EEvPcPiS0_iiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z16kExtractOutliersILi4EEvPcPiS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 14 registers, 396 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z16kExtractOutliersILi3EEvPcPiS0_iiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z16kExtractOutliersILi3EEvPcPiS0_iiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 13 registers, 396 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kgemm_4bit_inferenceI6__halfLi160EEviiiPT_PhPfS2_iiii' for 'sm_86'\n", "ptxas info : Function properties for _Z20kgemm_4bit_inferenceI6__halfLi160EEviiiPT_PhPfS2_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 72 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z20kgemm_4bit_inferenceI6__halfLi128EEviiiPT_PhPfS2_iiii' for 'sm_86'\n", "ptxas info : Function properties for _Z20kgemm_4bit_inferenceI6__halfLi128EEviiiPT_PhPfS2_iiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 72 registers, 416 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi96EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi96EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi64EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi64EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi32EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi32EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi128EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi128EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi160EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi160EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi192EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi192EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi16ELi256EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi16ELi256EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi96EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi96EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi64EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi64EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi32EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi32EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi128EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi128EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi160EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi160EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi192EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi192EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11gemm_deviceI6__halfLi32ELi256EEviiiPT_S2_S2_iii' for 'sm_86'\n", "ptxas info : Function properties for _Z11gemm_deviceI6__halfLi32ELi256EEviiiPT_S2_S2_iii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 168 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi2EEvPT_S1_S0_l' for 'sm_86'\n", "ptxas info : Function properties for _Z5kfuncIfLi2EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi1EEvPT_S1_S0_l' for 'sm_86'\n", "ptxas info : Function properties for _Z5kfuncIfLi1EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 30 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIhLi0EEvPT_S1_S0_l' for 'sm_86'\n", "ptxas info : Function properties for _Z5kfuncIhLi0EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z5kfuncIfLi0EEvPT_S1_S0_l' for 'sm_86'\n", "ptxas info : Function properties for _Z5kfuncIfLi0EEvPT_S1_S0_l\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 24 registers, 384 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi1EEvPT_PfS3_Pifiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi1EEvPT_PfS3_Pifiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi0EEvPT_PfS3_Pifiiii' for 'sm_86'\n", "ptxas info : Function properties for _Z15kgetColRowStatsI6__halfLi64ELi4ELi16ELi256ELi0EEvPT_PfS3_Pifiiii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 28 registers, 404 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z11kDequantizePfPhS_i' for 'sm_86'\n", "ptxas info : Function properties for _Z11kDequantizePfPhS_i\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 12 registers, 1024 bytes smem, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z9kQuantizePfS_Phi' for 'sm_86'\n", "ptxas info : Function properties for _Z9kQuantizePfS_Phi\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 51 registers, 21520 bytes smem, 380 bytes cmem[0]\n", "ptxas info : Compiling entry function '_Z22kHistogramScatterAdd2DPfPiS0_S_ii' for 'sm_86'\n", "ptxas info : Function properties for _Z22kHistogramScatterAdd2DPfPiS0_S_ii\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Used 14 registers, 392 bytes cmem[0]\n", "ptxas info : Function properties for _Z9dQuantizeILi1EEhPfff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z9dQuantizeILi0EEhPfff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z12dQuantizeNF4f\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z14dDequantizeNF4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z15dhDequantizeNF4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z12dQuantizeFP4f\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z18dDequantizeFP4Treehf\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z15d2DequantizeFP4h\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z14dDequantizeFP4hf\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z9atomicMinPff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "ptxas info : Function properties for _Z9atomicMaxPff\n", " 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads\n", "/usr/local/cuda/bin/nvcc -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -Xcompiler '-fPIC' -dlink /workspace/bitsandbytes/build/ops.o /workspace/bitsandbytes/build/kernels.o -o /workspace/bitsandbytes/build/link.o\n", "/usr/bin/g++ -std=c++14 -DBUILD_CUDA -shared -fPIC -I /usr/local/cuda/include -I /workspace/bitsandbytes/csrc -I /include -I /workspace/bitsandbytes/include /workspace/bitsandbytes/build/ops.o /workspace/bitsandbytes/build/kernels.o /workspace/bitsandbytes/build/link.o /workspace/bitsandbytes/csrc/common.cpp /workspace/bitsandbytes/csrc/cpu_ops.cpp /workspace/bitsandbytes/csrc/pythonInterface.c -o ./bitsandbytes/libbitsandbytes_cuda118.so -L /usr/local/cuda/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L /lib\n", "libs: ['libbitsandbytes_cuda118.so']\n", "running install\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/setuptools/command/install.py:34: SetuptoolsDeprecationWarning: setup.py install is deprecated. Use build and pip and other standards-based tools.\n", " warnings.warn(\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/setuptools/command/easy_install.py:144: EasyInstallDeprecationWarning: easy_install command is deprecated. Use build and pip and other standards-based tools.\n", " warnings.warn(\n", "running bdist_egg\n", "running egg_info\n", "creating bitsandbytes.egg-info\n", "writing bitsandbytes.egg-info/PKG-INFO\n", "writing dependency_links to bitsandbytes.egg-info/dependency_links.txt\n", "writing top-level names to bitsandbytes.egg-info/top_level.txt\n", "writing manifest file 'bitsandbytes.egg-info/SOURCES.txt'\n", "reading manifest file 'bitsandbytes.egg-info/SOURCES.txt'\n", "adding license file 'LICENSE'\n", "adding license file 'NOTICE.md'\n", "writing manifest file 'bitsandbytes.egg-info/SOURCES.txt'\n", "installing library code to build/bdist.linux-x86_64/egg\n", "running install_lib\n", "running build_py\n", "creating build/lib\n", "creating build/lib/bitsandbytes\n", "copying bitsandbytes/__init__.py -> build/lib/bitsandbytes\n", "copying bitsandbytes/__main__.py -> build/lib/bitsandbytes\n", "copying bitsandbytes/cextension.py -> build/lib/bitsandbytes\n", "copying bitsandbytes/functional.py -> build/lib/bitsandbytes\n", "copying bitsandbytes/utils.py -> build/lib/bitsandbytes\n", "creating build/lib/bitsandbytes/autograd\n", "copying bitsandbytes/autograd/__init__.py -> build/lib/bitsandbytes/autograd\n", "copying bitsandbytes/autograd/_functions.py -> build/lib/bitsandbytes/autograd\n", "creating build/lib/bitsandbytes/cuda_setup\n", "copying bitsandbytes/cuda_setup/__init__.py -> build/lib/bitsandbytes/cuda_setup\n", "copying bitsandbytes/cuda_setup/env_vars.py -> build/lib/bitsandbytes/cuda_setup\n", "copying bitsandbytes/cuda_setup/main.py -> build/lib/bitsandbytes/cuda_setup\n", "creating build/lib/bitsandbytes/nn\n", "copying bitsandbytes/nn/__init__.py -> build/lib/bitsandbytes/nn\n", "copying bitsandbytes/nn/modules.py -> build/lib/bitsandbytes/nn\n", "copying bitsandbytes/nn/triton_based_modules.py -> build/lib/bitsandbytes/nn\n", "creating build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/__init__.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/adagrad.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/adam.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/adamw.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/lamb.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/lars.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/lion.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/optimizer.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/rmsprop.py -> build/lib/bitsandbytes/optim\n", "copying bitsandbytes/optim/sgd.py -> build/lib/bitsandbytes/optim\n", "creating build/lib/bitsandbytes/research\n", "copying bitsandbytes/research/__init__.py -> build/lib/bitsandbytes/research\n", "creating build/lib/bitsandbytes/triton\n", "copying bitsandbytes/triton/__init__.py -> build/lib/bitsandbytes/triton\n", "copying bitsandbytes/triton/dequantize_rowwise.py -> build/lib/bitsandbytes/triton\n", "copying bitsandbytes/triton/int8_matmul_mixed_dequanitze.py -> build/lib/bitsandbytes/triton\n", "copying bitsandbytes/triton/int8_matmul_rowwise_dequantize.py -> build/lib/bitsandbytes/triton\n", "copying bitsandbytes/triton/quantize_columnwise_and_transpose.py -> build/lib/bitsandbytes/triton\n", "copying bitsandbytes/triton/quantize_global.py -> build/lib/bitsandbytes/triton\n", "copying bitsandbytes/triton/quantize_rowwise.py -> build/lib/bitsandbytes/triton\n", "copying bitsandbytes/triton/triton_utils.py -> build/lib/bitsandbytes/triton\n", "creating build/lib/bitsandbytes/research/autograd\n", "copying bitsandbytes/research/autograd/__init__.py -> build/lib/bitsandbytes/research/autograd\n", "copying bitsandbytes/research/autograd/_functions.py -> build/lib/bitsandbytes/research/autograd\n", "creating build/lib/bitsandbytes/research/nn\n", "copying bitsandbytes/research/nn/__init__.py -> build/lib/bitsandbytes/research/nn\n", "copying bitsandbytes/research/nn/modules.py -> build/lib/bitsandbytes/research/nn\n", "copying bitsandbytes/libbitsandbytes_cuda118.so -> build/lib/bitsandbytes\n", "creating build/bdist.linux-x86_64\n", "creating build/bdist.linux-x86_64/egg\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes\n", "copying build/lib/bitsandbytes/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes\n", "copying build/lib/bitsandbytes/__main__.py -> build/bdist.linux-x86_64/egg/bitsandbytes\n", "copying build/lib/bitsandbytes/cextension.py -> build/bdist.linux-x86_64/egg/bitsandbytes\n", "copying build/lib/bitsandbytes/functional.py -> build/bdist.linux-x86_64/egg/bitsandbytes\n", "copying build/lib/bitsandbytes/utils.py -> build/bdist.linux-x86_64/egg/bitsandbytes\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes/autograd\n", "copying build/lib/bitsandbytes/autograd/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes/autograd\n", "copying build/lib/bitsandbytes/autograd/_functions.py -> build/bdist.linux-x86_64/egg/bitsandbytes/autograd\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes/cuda_setup\n", "copying build/lib/bitsandbytes/cuda_setup/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes/cuda_setup\n", "copying build/lib/bitsandbytes/cuda_setup/env_vars.py -> build/bdist.linux-x86_64/egg/bitsandbytes/cuda_setup\n", "copying build/lib/bitsandbytes/cuda_setup/main.py -> build/bdist.linux-x86_64/egg/bitsandbytes/cuda_setup\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes/nn\n", "copying build/lib/bitsandbytes/nn/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes/nn\n", "copying build/lib/bitsandbytes/nn/modules.py -> build/bdist.linux-x86_64/egg/bitsandbytes/nn\n", "copying build/lib/bitsandbytes/nn/triton_based_modules.py -> build/bdist.linux-x86_64/egg/bitsandbytes/nn\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/adagrad.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/adam.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/adamw.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/lamb.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/lars.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/lion.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/optimizer.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/rmsprop.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "copying build/lib/bitsandbytes/optim/sgd.py -> build/bdist.linux-x86_64/egg/bitsandbytes/optim\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes/research\n", "copying build/lib/bitsandbytes/research/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes/research\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes/research/autograd\n", "copying build/lib/bitsandbytes/research/autograd/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes/research/autograd\n", "copying build/lib/bitsandbytes/research/autograd/_functions.py -> build/bdist.linux-x86_64/egg/bitsandbytes/research/autograd\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes/research/nn\n", "copying build/lib/bitsandbytes/research/nn/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes/research/nn\n", "copying build/lib/bitsandbytes/research/nn/modules.py -> build/bdist.linux-x86_64/egg/bitsandbytes/research/nn\n", "creating build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/triton/__init__.py -> build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/triton/dequantize_rowwise.py -> build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py -> build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py -> build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/triton/quantize_columnwise_and_transpose.py -> build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/triton/quantize_global.py -> build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/triton/quantize_rowwise.py -> build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/triton/triton_utils.py -> build/bdist.linux-x86_64/egg/bitsandbytes/triton\n", "copying build/lib/bitsandbytes/libbitsandbytes_cuda118.so -> build/bdist.linux-x86_64/egg/bitsandbytes\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/__main__.py to __main__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/cextension.py to cextension.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/functional.py to functional.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/utils.py to utils.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/autograd/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/autograd/_functions.py to _functions.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/cuda_setup/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/cuda_setup/env_vars.py to env_vars.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/cuda_setup/main.py to main.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/nn/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/nn/modules.py to modules.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/nn/triton_based_modules.py to triton_based_modules.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/adagrad.py to adagrad.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/adam.py to adam.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/adamw.py to adamw.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/lamb.py to lamb.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/lars.py to lars.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/lion.py to lion.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/optimizer.py to optimizer.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/rmsprop.py to rmsprop.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/optim/sgd.py to sgd.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/research/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/research/autograd/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/research/autograd/_functions.py to _functions.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/research/nn/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/research/nn/modules.py to modules.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/triton/__init__.py to __init__.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/triton/dequantize_rowwise.py to dequantize_rowwise.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py to int8_matmul_mixed_dequanitze.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py to int8_matmul_rowwise_dequantize.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/triton/quantize_columnwise_and_transpose.py to quantize_columnwise_and_transpose.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/triton/quantize_global.py to quantize_global.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/triton/quantize_rowwise.py to quantize_rowwise.cpython-39.pyc\n", "byte-compiling build/bdist.linux-x86_64/egg/bitsandbytes/triton/triton_utils.py to triton_utils.cpython-39.pyc\n", "creating build/bdist.linux-x86_64/egg/EGG-INFO\n", "copying bitsandbytes.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO\n", "copying bitsandbytes.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", "copying bitsandbytes.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", "copying bitsandbytes.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", "writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt\n", "zip_safe flag not set; analyzing archive contents...\n", "bitsandbytes.cuda_setup.__pycache__.main.cpython-39: module references __file__\n", "creating dist\n", "creating 'dist/bitsandbytes-0.39.0-py3.9.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n", "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n", "Processing bitsandbytes-0.39.0-py3.9.egg\n", "removing '/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg' (and everything under it)\n", "creating /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg\n", "Extracting bitsandbytes-0.39.0-py3.9.egg to /root/miniconda3/envs/py3.9/lib/python3.9/site-packages\n", "bitsandbytes 0.39.0 is already the active version in easy-install.pth\n", "\n", "Installed /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg\n", "Processing dependencies for bitsandbytes==0.39.0\n", "Finished processing dependencies for bitsandbytes==0.39.0\n", "\n", "===================================BUG REPORT===================================\n", "Welcome to bitsandbytes. For bug reports, please run\n", "\n", "python -m bitsandbytes\n", "\n", " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", "================================================================================\n", "bin /workspace/bitsandbytes/bitsandbytes/libbitsandbytes_cuda118.so\n", "/workspace/bitsandbytes/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib64'), PosixPath('/usr/local/nvidia/lib')}\n", " warn(msg)\n", "/workspace/bitsandbytes/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n", " warn(msg)\n", "/workspace/bitsandbytes/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n", " warn(msg)\n", "/workspace/bitsandbytes/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIK1tFOFrWbmoa2ckCJYhzgBHKTSMeR/AeuScCCzugqlI utensilcandel@gmail.com')}\n", " warn(msg)\n", "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n", "/workspace/bitsandbytes/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/usr/local/cuda/lib64/libcudart.so.11.0'), PosixPath('/usr/local/cuda/lib64/libcudart.so')}.. We'll flip a coin and try one of these, in order to fail forward.\n", "Either way, this might cause trouble in the future:\n", "If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.\n", " warn(msg)\n", "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0\n", "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n", "CUDA SETUP: Detected CUDA version 118\n", "CUDA SETUP: Loading binary /workspace/bitsandbytes/bitsandbytes/libbitsandbytes_cuda118.so...\n", "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n", "++++++++++++++++++ BUG REPORT INFORMATION ++++++++++++++++++\n", "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n", "\n", "++++++++++++++++++ /usr/local CUDA PATHS +++++++++++++++++++\n", "/usr/local/cuda-11.8/compat/libcuda.so\n", "/usr/local/cuda-11.8/targets/x86_64-linux/lib/libcudart.so\n", "/usr/local/cuda-11.8/targets/x86_64-linux/lib/stubs/libcuda.so\n", "\n", "+++++++++++++++ WORKING DIRECTORY CUDA PATHS +++++++++++++++\n", "/workspace/bitsandbytes/bitsandbytes/libbitsandbytes_cuda118.so\n", "/workspace/bitsandbytes/build/lib/bitsandbytes/libbitsandbytes_cuda118.so\n", "\n", "++++++++++++++++++ LD_LIBRARY CUDA PATHS +++++++++++++++++++\n", "\n", "++++++++++++++++++++++++++ OTHER +++++++++++++++++++++++++++\n", "COMPILED_WITH_CUDA = True\n", "COMPUTE_CAPABILITIES_PER_GPU = ['8.6']\n", "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n", "++++++++++++++++++++++ DEBUG INFO END ++++++++++++++++++++++\n", "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n", "\n", "Running a quick check that:\n", " + library is importable\n", " + CUDA function is callable\n", "\n", "\n", "WARNING: Please be sure to sanitize sensible info from any such env vars!\n", "\n", "SUCCESS!\n", "Installation was successful!\n" ] } ], "source": [ "!CUDA_VERSION=118 make cuda11x\n", "!python setup.py install\n", "!python -m bitsandbytes" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspace/axolotl\n" ] } ], "source": [ "%cd /workspace/axolotl" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:" ] }, { "name": "stdin", "output_type": "stream", "text": [ " ········\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import wandb\n", "wandb.login()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\n", "\n", "===================================BUG REPORT===================================\n", "Welcome to bitsandbytes. For bug reports, please run\n", "\n", "python -m bitsandbytes\n", "\n", " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", "================================================================================\n", "bin /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda118.so\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib'), PosixPath('/usr/local/nvidia/lib64')}\n", " warn(msg)\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n", " warn(msg)\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n", " warn(msg)\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIK1tFOFrWbmoa2ckCJYhzgBHKTSMeR/AeuScCCzugqlI utensilcandel@gmail.com')}\n", " warn(msg)\n", "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/usr/local/cuda/lib64/libcudart.so.11.0'), PosixPath('/usr/local/cuda/lib64/libcudart.so')}.. We'll flip a coin and try one of these, in order to fail forward.\n", "Either way, this might cause trouble in the future:\n", "If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.\n", " warn(msg)\n", "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0\n", "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n", "CUDA SETUP: Detected CUDA version 118\n", "CUDA SETUP: Loading binary /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda118.so...\n", "Setting ds_accelerator to cuda (auto detect)\n", "WARNING:root:`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model.\n", "INFO:root:loading tokenizer...\n", "Using pad_token, but it is not set yet.\n", "INFO:root:Loading prepared dataset from disk at last_run_prepared/546f844533b11547ae11d2027d0fd307...\n", "INFO:root:Prepared dataset loaded from disk...\n", "INFO:root:loading model and peft_config...\n", "INFO:root:converting PEFT model w/ prepare_model_for_int8_training\n", "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/peft/utils/other.py:76: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.\n", " warnings.warn(\n", "INFO:root:found linear modules: ['query_key_value', 'dense_h_to_4h', 'dense_4h_to_h', 'dense']\n", "trainable params: 12582912 || all params: 1324142592 || trainable%: 0.9502686550543342\n", "INFO:root:Compiling torch model\n", "INFO:root:Pre-saving adapter config to ./falcon-rw-1b\n", "INFO:root:Starting trainer...\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mutensil\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.3\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m/workspace/axolotl/wandb/run-20230529_100931-jgmqihyz\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mvolcanic-paper-1\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/utensil/falcon-rw-1b\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/utensil/falcon-rw-1b/runs/jgmqihyz\u001b[0m\n", " 0%| | 0/108044 [00:00