registryVersion: 1.9.0 models: - name: Llama 3.3 70B Instruct displayName: Llama 3.3 70B Instruct modelHubID: llama-3-3-70b-instruct category: Language type: NGC description: Instruction-tuned model for multilingual chat, coding assistance, and synthetic data generation. Delivers strong performance across diverse benchmarks. requireLicense: true licenseAgreements: - label: Use Policy url: https://www.llama.com/llama3_3/use-policy/ - label: License Agreement url: https://www.llama.com/llama3_3/license/ modelVariants: - variantId: Llama 3.3 70B Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama-3.3-70b-instruct optimizationProfiles: - profileId: nim/meta/llama-3.3-70b-instruct:a100x4-throughput-bf16-sf8byh808a framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct A100x4 BF16 Throughput ngcMetadata: 00e6f59e1003f038ecee8e9aa3ab2d40745bef214c476a381b21886dd8383952: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 4 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 140GB - profileId: nim/meta/llama-3.3-70b-instruct:h200x4-latency-fp8-nju7sb1wcw framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H200x4 FP8 Latency ngcMetadata: 13a9a5e5b372db6e92ecd2523a1a5d8b8f6ebd3fa8849608481e05a596a38d9e: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 4 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: 
DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100_NVLx8 BF16 Latency ngcMetadata: 144fcde387869e92dfec8597f477ad671ee4424269e3e25cd16037c721bf925d: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 8 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100_NVLx4 BF16 Throughput ngcMetadata: 14654290e66815c15ef45c507c483a4bcc3a22fcc11a479083bce0a14b743b71: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 4 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:h100x8-latency-fp8-z88enisl8a framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100x8 FP8 Latency ngcMetadata: 233973ff86b33b1076b8d8dfbf1b1c292ad224ae2d9c8b18f28a44b6f6f42768: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 
2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 70GB - profileId: nim/meta/llama-3.3-70b-instruct:l40sx4-throughput-fp8-daydbgtrgg framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct L40Sx4 FP8 Throughput ngcMetadata: 3d0e5989f2fbc23e7d4504cd69269c9636deb61d0efc12225d3d59d54afea297: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:b200x2-latency-fp8-yoijdqa45a framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct B200x2 FP8 Latency ngcMetadata: 4950d30811e1e426e97cda69e6c03a8a4819db8aa4abf34722ced4542a1f6b52: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 2 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100_NVLx2 FP8 Throughput ngcMetadata: 4c538175eb36814513f5c95c115c8ed15273f0cffda9d2d355a17f0f311f2fbd: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT 
value: 2 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100_NVLx8 FP8 Latency ngcMetadata: 582fd7bfbe504eb5ee4ded5254cced1d83ea2682a91b6dd6610af842be947ecc: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 8 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:l40sx8-throughput-bf16-essm4-kcrg framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct L40Sx8 BF16 Throughput ngcMetadata: 60b95dfcc3a17bf00cabb2da1a264f5e8757763d0ebe2a3a073c5c0fc7c078ec: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 8 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 151GB - profileId: nim/meta/llama-3.3-70b-instruct:a100x8-latency-bf16-i6bl589a3a framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct A100x8 BF16 Latency ngcMetadata: 646e2eff5f305302c1cd5fe873ef7c8172021d9948157163761817c4e36352d7: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - 
key: GPU value: A100 - key: COUNT value: 8 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 150GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100_NVLx4 FP8 Throughput ngcMetadata: 6708ebad5077e24eaff0eabce1134feb16b2a35d2313567b94e3f27479a90544: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 4 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:h100x4-latency-fp8-tkp3aadetg framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100x4 FP8 Latency ngcMetadata: 6d6d2aebdecec52d7982746f98b00421cf53e10295a9ac7f993e4554fa164d10: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:h200x4-throughput-bf16-eltdntbjla framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H200x4 BF16 Throughput ngcMetadata: 6dc00fc21eb6d8de62d35c96eed22174e205fdb3db816dbe547deeb37fbdd9a8: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: 
THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 4 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:h100x8-latency-bf16-kwqeyhgvua framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100x8 BF16 Latency ngcMetadata: 758482618a1f166cc4e620228600410a6f05649a05c1838d5a93572d44289b95: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 147GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100_NVLx4 FP8 Latency ngcMetadata: 76fc388794dc368145a440d16d72c0ba70e4aecac09901fe4a2c06a767c7eb0d: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 4 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:h100x4-throughput-fp8-cqigo1kenw framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100x4 FP8 Throughput ngcMetadata: 7d8a02f47911fb7ddf1a6f6b09438f621b6057cb21098999484f09d5a5bb7b23: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm 
spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:h100x2-throughput-fp8--pwiqokzsa framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100x2 FP8 Throughput ngcMetadata: 7ee2258631ed9d51ebfe5ab44bd547ae5777217686d87cc89c15d06ccdca4047: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:h100x4-throughput-bf16-ygpeeau-0q framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100x4 BF16 Throughput ngcMetadata: 7f99ed5107c79b938b0ef4fcf2dd21aac27281f71d41a0a7c46d649879d374f0: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 138GB - profileId: nim/meta/llama-3.3-70b-instruct:b200x1-throughput-fp8-sfrhca0ipw framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct B200x1 FP8 Throughput ngcMetadata: 8b87146e39b0305ae1d73bc053564d1b4b4c565f81aa5abe3e84385544ca9b60: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: 
throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 68GB - profileId: nim/meta/llama-3.3-70b-instruct:b200x4-latency-fp8-jwv73nrwia framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct B200x4 FP8 Latency ngcMetadata: 9527145a2d1316a1e55581d1f6b0a45e394fe37b853ec5172dea14c2c9767d96: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 4 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:h200x4-latency-bf16-bdxpl7wu-g framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H200x4 BF16 Latency ngcMetadata: 99142c13a095af184ae20945a208a81fae8d650ac0fd91747b03148383f882cf: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 4 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:b200x4-throughput-bf16-dnxvrdjuta framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct B200x4 BF16 Throughput ngcMetadata: 9b4836e143f78d245cf161c16a225be11d3e8f9b2024b99dd76e5b2ac6cd7efd: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 
profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 4 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:h200x1-throughput-fp8-9qirfnkola framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H200x1 FP8 Throughput ngcMetadata: af876a179190d1832143f8b4f4a71f640f3df07b0503259cedee3e3a8363aa96: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 68GB - profileId: nim/meta/llama-3.3-70b-instruct:h200x2-throughput-bf16-qxgo9ky1rq framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H200x2 BF16 Throughput ngcMetadata: b407d3df1db123ba8a4c98fb9f73790c01cd53a70fa0e0185814ad57a17cb72b: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:h200x2-throughput-fp8-j5rwrqq4aa framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H200x2 FP8 Throughput ngcMetadata: c91a755246cb08dd9aa6905bc40b7db552071d141a850be5a791b06eb4fb2ef8: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm 
number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:h100x8-throughput-bf16-2i0l24npsg framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100x8 BF16 Throughput ngcMetadata: d128c772583bd10da4f31bf8e961893eb2b62363f3cecb94b5ef67d8bbd54665: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 147GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H100_NVLx8 BF16 Throughput ngcMetadata: d14fa7bd1f4287e74b856fe3f0030312cc4d03b8fe35a8c8aaedf0140ac55067: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 8 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:b200x2-throughput-bf16-4qvbdeuv4a framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct B200x2 BF16 Throughput ngcMetadata: d33e8144476992a7d8d621d8e50cf66b89d254dc721aa2782e5a5a6f07b1af80: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: 
B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 2 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct A100_SXM4_40GBx8 BF16 Throughput ngcMetadata: dc0f5f87ca37f69af7f525ac293c599cd0cbdaf8130da4d9e2ad63d376b12039: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: A100_SXM4_40GB gpu_device: 20b0:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100_SXM4_40GB - key: COUNT value: 8 - key: GPU DEVICE value: 20B0:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:h200x2-latency-fp8-pgmrxe0j3g framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct H200x2 FP8 Latency ngcMetadata: e4f217a5fb016b570e34b8a8eb06051ccfef9534ba43da973bb7f678242eaa5f: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:b200x4-latency-bf16-mnjb4olhmw framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct B200x4 BF16 Latency ngcMetadata: f17543bf1ee65e4a5c485385016927efe49cbc068a6021573d83eacb32537f76: model: 
meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 4 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct Generic NVIDIA GPUx8 BF16 ngcMetadata: 1d7b604f835f74791e6bfd843047fc00a5aef0f72954ca48ce963811fb6f3f09: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '8' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 8 - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct Generic NVIDIA GPUx2 BF16 ngcMetadata: 375dc0ff86133c2a423fbe9ef46d8fdf12d6403b3caa3b8e70d7851a89fc90dd: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '2' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 2 - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3.3-70b-instruct:hf-5825c91-tool-calling-fix-checksum framework: TensorRT-LLM displayName: Llama 3.3 70B Instruct Generic NVIDIA GPUx4 BF16 ngcMetadata: 54946b08b79ecf9e7f2d5c000234bf2cce19c8fee21b243c1a084b03897e8c95: model: meta/llama-3.3-70b-instruct release: 1.8.5 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '4' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT 
value: 4 - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB labels: - Llama - Meta - Chat - Text Generation - Large Language Model - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Llama 3.3 Nemotron Super 49B V1 displayName: Llama 3.3 Nemotron Super 49B V1 modelHubID: nvidia/llama-3.3-nemotron-super-49b-v1 category: Language type: NGC description: Llama-3.3-Nemotron-Super-49B-v1 is a language model that can follow instructions, complete requests, and generate creative text formats. The Llama-3.3-Nemotron-Super-49B-v1 Large Language Model (LLM) is an instruct fine-tuned version of the Llama-Nemotron. requireLicense: true licenseAgreements: - label: Use Policy url: https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/ - label: License Agreement url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/ modelVariants: - variantId: Llama 3.3 Nemotron Super 49B V1 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/llama-3.3-nemotron-super-49b-v1 optimizationProfiles: - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:a100x4-throughput-bf16--d40eserlg framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 A100x4 BF16 Throughput ngcMetadata: 00e6f59e1003f038ecee8e9aa3ab2d40745bef214c476a381b21886dd8383952: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 4 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 101GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: 
TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100_NVLx8 BF16 Latency ngcMetadata: 144fcde387869e92dfec8597f477ad671ee4424269e3e25cd16037c721bf925d: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 8 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100_NVLx4 BF16 Throughput ngcMetadata: 14654290e66815c15ef45c507c483a4bcc3a22fcc11a479083bce0a14b743b71: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 4 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h100x8-latency-fp8-sfw5xn1oba framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100x8 FP8 Latency ngcMetadata: 233973ff86b33b1076b8d8dfbf1b1c292ad224ae2d9c8b18f28a44b6f6f42768: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: 
DOWNLOAD SIZE value: 51GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:b200x2-throughput-fp8-kiq2efz-dq framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 B200x2 FP8 Throughput ngcMetadata: 26bd84b107a99415b474267bec4cbcf932fbb28e45d7fb4e4db2971506825888: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 2 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 49GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:l40sx4-throughput-fp8-dtuojeeekw framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 L40Sx4 FP8 Throughput ngcMetadata: 3d0e5989f2fbc23e7d4504cd69269c9636deb61d0efc12225d3d59d54afea297: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 50GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100_NVLx2 FP8 Throughput ngcMetadata: 4c538175eb36814513f5c95c115c8ed15273f0cffda9d2d355a17f0f311f2fbd: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - 
key: GPU value: H100_NVL - key: COUNT value: 2 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100_NVLx1 FP8 Throughput ngcMetadata: 5811750e70b7e9f340f4d670c72fcbd5282e254aeb31f62fd4f937cfb9361007: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100_NVLx8 FP8 Latency ngcMetadata: 582fd7bfbe504eb5ee4ded5254cced1d83ea2682a91b6dd6610af842be947ecc: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 8 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:a100x8-latency-bf16-96llyrpauw framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 A100x8 BF16 Latency ngcMetadata: 646e2eff5f305302c1cd5fe873ef7c8172021d9948157163761817c4e36352d7: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 
profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 8 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 111GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:l40sx8-latency-bf16-9kyxnmiu9w framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 L40Sx8 BF16 Latency ngcMetadata: 66341208a7bba7fdde341dcad4a654eecb27681d2e322ec10c4fde9970030c26: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 8 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 111GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100_NVLx4 FP8 Throughput ngcMetadata: 6708ebad5077e24eaff0eabce1134feb16b2a35d2313567b94e3f27479a90544: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 4 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h100x8-latency-bf16-prto0dmpjw framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100x8 BF16 Latency ngcMetadata: 758482618a1f166cc4e620228600410a6f05649a05c1838d5a93572d44289b95: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 
'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 109GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h100x1-throughput-fp8-kjzavt-3zq framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100x1 FP8 Throughput ngcMetadata: 7b508014e846234db3cabe5c9f38568b4ee96694b60600a0b71c621dc70cacf3: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 49GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h100x4-throughput-fp8-slqbwxm0vq framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100x4 FP8 Throughput ngcMetadata: 7d8a02f47911fb7ddf1a6f6b09438f621b6057cb21098999484f09d5a5bb7b23: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 50GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h100x2-throughput-fp8-4ocry3irow framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100x2 FP8 Throughput ngcMetadata: 
7ee2258631ed9d51ebfe5ab44bd547ae5777217686d87cc89c15d06ccdca4047: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 49GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h100x4-throughput-bf16-fzhqywxh-a framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H100x4 BF16 Throughput ngcMetadata: 7f99ed5107c79b938b0ef4fcf2dd21aac27281f71d41a0a7c46d649879d374f0: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 100GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:b200x1-throughput-fp8-cpviqqa47q framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 B200x1 FP8 Throughput ngcMetadata: 8b87146e39b0305ae1d73bc053564d1b4b4c565f81aa5abe3e84385544ca9b60: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 49GB - profileId: 
nim/nvidia/llama-3.3-nemotron-super-49b-v1:b200x8-latency-fp8-jbthzwoarq framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 B200x8 FP8 Latency ngcMetadata: 8f9f165fc2a52b860b8eca20856e3bf5f6dc411ff3e2d1e617b1e4408a1d0191: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 8 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 50GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:b200x8-latency-bf16-glb4omvl8q framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 B200x8 BF16 Latency ngcMetadata: 91df8db9fbe818a6a9c3cb1779f151ac7bc70d4806924abdd591c7cf1bfee2f6: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 8 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 108GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:a10gx8-throughput-bf16-ea3czux3aq framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 A10Gx8 BF16 Throughput ngcMetadata: 935ec3ac922bf54106311dfc6b3214a1651a26033b4f5007b6351fffb4058b7a: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 8 - key: 
GPU DEVICE value: 2237:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 111GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h200x4-latency-bf16-2v7ziveceg framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H200x4 BF16 Latency ngcMetadata: 99142c13a095af184ae20945a208a81fae8d650ac0fd91747b03148383f882cf: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 4 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 100GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:b200x4-throughput-bf16-wudyjwpk6w framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 B200x4 BF16 Throughput ngcMetadata: 9b4836e143f78d245cf161c16a225be11d3e8f9b2024b99dd76e5b2ac6cd7efd: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 4 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 100GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:b200x4-throughput-fp8-mvpvygyr-g framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 B200x4 FP8 Throughput ngcMetadata: a9b23031714881187b3beddb0eaa526006c799def8fca0e7975721724296a9d2: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm 
spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 4 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 50GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:l40sx4-throughput-bf16-gt01zn8w7a framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 L40Sx4 BF16 Throughput ngcMetadata: ab8f2faec3bcafc32efaf05acada4df4d8a171a759b4fb5c44d2d9d43a348764: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 101GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h200x1-throughput-fp8-bj-uzcumnq framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H200x1 FP8 Throughput ngcMetadata: af876a179190d1832143f8b4f4a71f640f3df07b0503259cedee3e3a8363aa96: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 49GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h200x2-throughput-bf16-cqdwimpbbw framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H200x2 BF16 Throughput ngcMetadata: b407d3df1db123ba8a4c98fb9f73790c01cd53a70fa0e0185814ad57a17cb72b: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 
2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 96GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 A100_SXM4_40GBx8 BF16 Throughput ngcMetadata: dc0f5f87ca37f69af7f525ac293c599cd0cbdaf8130da4d9e2ad63d376b12039: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: A100_SXM4_40GB gpu_device: 20b0:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100_SXM4_40GB - key: COUNT value: 8 - key: GPU DEVICE value: 20B0:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:l40sx8-latency-fp8-ataopkp21a framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 L40Sx8 FP8 Latency ngcMetadata: e19c01f4cfb3b39ba19830f23fde73783d9c3044a5864bdee29e13c867a5382c: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 8 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 51GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:h200x2-latency-fp8-gljasu2ggw framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 H200x2 FP8 Latency ngcMetadata: 
e4f217a5fb016b570e34b8a8eb06051ccfef9534ba43da973bb7f678242eaa5f: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 49GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 GH200_480GBx1 FP8 Throughput ngcMetadata: f49b49f3d90159a594def51efd8595f1d618e288bca2721fe08e786a1ac67d04: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' gpu: GH200_480GB gpu_device: 2342:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: GH200_480GB - key: COUNT value: 1 - key: GPU DEVICE value: 2342:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 Generic NVIDIA GPUx8 BF16 ngcMetadata: 1d7b604f835f74791e6bfd843047fc00a5aef0f72954ca48ce963811fb6f3f09: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '8' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 8 - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 Generic NVIDIA GPUx2 BF16 ngcMetadata: 
375dc0ff86133c2a423fbe9ef46d8fdf12d6403b3caa3b8e70d7851a89fc90dd: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '2' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 2 - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB - profileId: nim/nvidia/llama-3.3-nemotron-super-49b-v1:hf-1a2cb80-nim-tool-use framework: TensorRT-LLM displayName: Llama 3.3 Nemotron Super 49B V1 Generic NVIDIA GPUx4 BF16 ngcMetadata: 54946b08b79ecf9e7f2d5c000234bf2cce19c8fee21b243c1a084b03897e8c95: model: nvidia/llama-3.3-nemotron-super-49b-v1 release: 1.8.6 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '4' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 4 - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 93GB labels: - Llama - Chatbots - Virtual Assistants - Large Language Model - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Llama 3.1 Nemotron Nano 8b V1 displayName: Llama 3.1 Nemotron Nano 8b V1 modelHubID: llama-3.1-nemotron-nano-8b-v1 category: Language type: NGC description: Llama 3.1 Nemotron Nano 8B or 4B is a language model that can follow instructions, complete requests, and generate creative text formats. 
requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3/license/ modelVariants: - variantId: Llama 3.1 Nemotron Nano 8b V1 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/llama-3.1-nemotron-nano-8b-v1 optimizationProfiles: - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:a100x2-latency-bf16-zxsnn7zu2g framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 A100x2 BF16 Latency ngcMetadata: 2146fcf18ea0412d564c6ed21d2f727281b95361fd78ccfa3d0570ec1716e8db: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 2 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:a100x1-throughput-bf16-jfn07bk9ua framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 A100x1 BF16 Throughput ngcMetadata: 222d1729a785201e8a021b226d74d227d01418c41b556283ee1bdbf0a818bd94: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 1 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:hf-25.03.17-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H100_NVLx1 BF16 Throughput ngcMetadata: 
25b5e251d366671a4011eaada9872ad1d02b48acc33aa0637853a3e3c3caa516: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:h200x1-throughput-bf16-hqyhv2wimw framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H200x1 BF16 Throughput ngcMetadata: 434e8d336fa23cbe151748d32b71e196d69f20d319ee8b59852a1ca31a48d311: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:hf-25.03.17-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H100_NVLx1 FP8 Throughput ngcMetadata: 5811750e70b7e9f340f4d670c72fcbd5282e254aeb31f62fd4f937cfb9361007: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 15GB - profileId: 
nim/nvidia/llama-3.1-nemotron-nano-8b-v1:h200x2-latency-bf16-q6opgs6yja framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H200x2 BF16 Latency ngcMetadata: 6832a9395f54086162fd7b1c6cfaae17c7d1e535a60e2b7675504c9fc7b57689: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:h100x2-latency-fp8-zsiywmloya framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H100x2 FP8 Latency ngcMetadata: 6c3f01dd2b2a56e3e83f70522e4195d3f2add70b28680082204bbb9d6150eb04: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:h100x1-throughput-fp8-5tn9pkgdbq framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H100x1 FP8 Throughput ngcMetadata: 7b508014e846234db3cabe5c9f38568b4ee96694b60600a0b71c621dc70cacf3: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 
2330:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:l40sx4-latency-bf16-k3y094rsxq framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 L40Sx4 BF16 Latency ngcMetadata: 844ebe2b42df8de8ce66cbb6ecf43f90858ea7efc14ddf020cf1ae7450ae0c33: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 19GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:a10gx2-throughput-bf16-htgj9vhmiw framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 A10Gx2 BF16 Throughput ngcMetadata: 8a62b002be0b7f82c407e5ed45c50dabe654deca052b521a920682f918323d0d: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 2 - key: GPU DEVICE value: 2237:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:l40sx2-throughput-bf16-qivaletdla framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 L40Sx2 BF16 Throughput ngcMetadata: 973a6bfbfc5d13fc5eb18f5011fab777a5bd257d5807e97f842a3364e82160dc: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT 
- key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:hf-25.03.17-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H100_NVLx2 FP8 Latency ngcMetadata: a00ce1e782317cd19ed192dcb0ce26ab8b0c1da8928c33de8893897888ff7580: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 2 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:l40sx1-throughput-bf16-anodjae0ya framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 L40Sx1 BF16 Throughput ngcMetadata: ac5071bbd91efcc71dc486fcd5210779570868b3b8328b4abf7a408a58b5e57c: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:l40sx1-throughput-fp8-dbamkqep8q framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 L40Sx1 FP8 Throughput ngcMetadata: ad17776f4619854fccd50354f31132a558a1ca619930698fd184d6ccf5fe3c99: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: 
'1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:h200x1-throughput-fp8-mafkx9-zmq framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H200x1 FP8 Throughput ngcMetadata: af876a179190d1832143f8b4f4a71f640f3df07b0503259cedee3e3a8363aa96: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:h100x2-latency-bf16-iq2eo5lxgw framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H100x2 BF16 Latency ngcMetadata: b3d535c0a7eaaea089b087ae645417c0b32fd01e7e9d638217cc032e51e74fd0: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:hf-25.03.17-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H100_NVLx2 BF16 Latency ngcMetadata: b7fad3b35b07d623fac6549078305b71d0e6e1d228a86fa0f7cfe4dbeca9151a: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 
'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 2 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:l40sx2-latency-fp8-hkd8uidneq framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 L40Sx2 FP8 Latency ngcMetadata: c4ff823a8202af4b523274fb8c6cdd73fa8ee5af16391a6d36b17f714a3c71a0: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:h200x2-latency-fp8-a3-t7tca3g framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H200x2 FP8 Latency ngcMetadata: e4f217a5fb016b570e34b8a8eb06051ccfef9534ba43da973bb7f678242eaa5f: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:h100x1-throughput-bf16-iugafozvdq framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 H100x1 BF16 Throughput ngcMetadata: 
e7dbd9a8ce6270d2ec649a0fecbcae9b5336566113525f20aee3809ba5e63856: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:l40sx2-latency-bf16-z1ujefobmq framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 L40Sx2 BF16 Latency ngcMetadata: fa36c3502e92c50f78a1906242f929864955e702b7dbfbdb19758fb7ee9aa811: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:hf-25.03.17-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 Generic NVIDIA GPUx2 BF16 ngcMetadata: 375dc0ff86133c2a423fbe9ef46d8fdf12d6403b3caa3b8e70d7851a89fc90dd: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '2' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 2 - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:hf-25.03.17-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 Generic NVIDIA GPUx4 BF16 ngcMetadata: 
54946b08b79ecf9e7f2d5c000234bf2cce19c8fee21b243c1a084b03897e8c95: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '4' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 4 - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/nvidia/llama-3.1-nemotron-nano-8b-v1:hf-25.03.17-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 8B V1 Generic NVIDIA GPUx1 BF16 ngcMetadata: ac34857f8dcbd174ad524974248f2faf271bd2a0355643b2cf1490d0fe7787c2: model: nvidia/llama-3.1-nemotron-nano-8b-v1 release: 1.8.4 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '1' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 15GB - variantId: Llama 3.1 Nemotron Nano 4b V1 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/llama3.1-nemotron-nano-4b-v1.1 optimizationProfiles: - profileId: nim/nvidia/llama3.1-nemotron-nano-4b-v1.1:hf-9f834a8-fix-checksum framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 4B V1.1 Generic NVIDIA GPUx2 BF16 ngcMetadata: 375dc0ff86133c2a423fbe9ef46d8fdf12d6403b3caa3b8e70d7851a89fc90dd: model: nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1 release: 1.8.4 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '2' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 2 - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/nvidia/llama3.1-nemotron-nano-4b-v1.1:hf-9f834a8-fix-checksum framework: TensorRT-LLM displayName: Llama 3.1 Nemotron Nano 4B V1.1 Generic NVIDIA GPUx1 BF16 ngcMetadata: ac34857f8dcbd174ad524974248f2faf271bd2a0355643b2cf1490d0fe7787c2: model: nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1 release: 1.8.4 tags: 
feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '1' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.4 - key: DOWNLOAD SIZE value: 9GB labels: - Llama - Meta - Text Generation - Large Language Model - NVIDIA Validated - Nemo config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Llama 3.1 Instruct displayName: Llama 3.1 Instruct modelHubID: llama-3.1-instruct category: Language type: NGC description: Multilingual instruction-following model optimized for chat, reasoning, and text generation. Strong performance across diverse language benchmarks. requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3/license/ modelVariants: - variantId: Llama 3.1 70B Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/llama-3_1-70b-instruct-nemo optimizationProfiles: - profileId: nim/meta/llama-3_1-70b-instruct:a100x4-throughput-bf16-w0uzw1gkbg framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct A100x4 BF16 Throughput ngcMetadata: 00e6f59e1003f038ecee8e9aa3ab2d40745bef214c476a381b21886dd8383952: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 4 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 140GB - profileId: nim/meta/llama-3_1-70b-instruct:h200x4-latency-fp8-ihwbqzj9ow framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H200x4 FP8 Latency ngcMetadata: 13a9a5e5b372db6e92ecd2523a1a5d8b8f6ebd3fa8849608481e05a596a38d9e: model: 
meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 4 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100_NVLx8 BF16 Latency ngcMetadata: 144fcde387869e92dfec8597f477ad671ee4424269e3e25cd16037c721bf925d: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 8 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100_NVLx4 BF16 Throughput ngcMetadata: 14654290e66815c15ef45c507c483a4bcc3a22fcc11a479083bce0a14b743b71: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 4 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:h100x8-latency-fp8-aecgnfbvhg framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100x8 FP8 Latency ngcMetadata: 
233973ff86b33b1076b8d8dfbf1b1c292ad224ae2d9c8b18f28a44b6f6f42768: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 70GB - profileId: nim/meta/llama-3_1-70b-instruct:b200x2-throughput-fp8-xykupukdga framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct B200x2 FP8 Throughput ngcMetadata: 26bd84b107a99415b474267bec4cbcf932fbb28e45d7fb4e4db2971506825888: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 2 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 68GB - profileId: nim/meta/llama-3_1-70b-instruct:l40sx4-throughput-fp8-uw2s64w-qg framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct L40Sx4 FP8 Throughput ngcMetadata: 3d0e5989f2fbc23e7d4504cd69269c9636deb61d0efc12225d3d59d54afea297: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:b200x2-latency-fp8-mkjcj1u-4g framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct B200x2 FP8 Latency 
ngcMetadata: 4950d30811e1e426e97cda69e6c03a8a4819db8aa4abf34722ced4542a1f6b52: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 2 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100_NVLx2 FP8 Throughput ngcMetadata: 4c538175eb36814513f5c95c115c8ed15273f0cffda9d2d355a17f0f311f2fbd: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 2 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100_NVLx8 FP8 Latency ngcMetadata: 582fd7bfbe504eb5ee4ded5254cced1d83ea2682a91b6dd6610af842be947ecc: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 8 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:l40sx8-throughput-bf16-p1mhasfmgw framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct L40Sx8 BF16 
Throughput ngcMetadata: 60b95dfcc3a17bf00cabb2da1a264f5e8757763d0ebe2a3a073c5c0fc7c078ec: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 8 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 151GB - profileId: nim/meta/llama-3_1-70b-instruct:a100x8-latency-bf16-b0vhtvjbxa framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct A100x8 BF16 Latency ngcMetadata: 646e2eff5f305302c1cd5fe873ef7c8172021d9948157163761817c4e36352d7: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 8 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 150GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100_NVLx4 FP8 Throughput ngcMetadata: 6708ebad5077e24eaff0eabce1134feb16b2a35d2313567b94e3f27479a90544: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 4 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:h100x4-latency-fp8-a-vzebrdia framework: TensorRT-LLM displayName: Llama 3.1 70B 
Instruct H100x4 FP8 Latency ngcMetadata: 6d6d2aebdecec52d7982746f98b00421cf53e10295a9ac7f993e4554fa164d10: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:h200x4-throughput-bf16-srhrzg3ziw framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H200x4 BF16 Throughput ngcMetadata: 6dc00fc21eb6d8de62d35c96eed22174e205fdb3db816dbe547deeb37fbdd9a8: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 4 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:h100x8-latency-bf16-z0pmfdyj0g framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100x8 BF16 Latency ngcMetadata: 758482618a1f166cc4e620228600410a6f05649a05c1838d5a93572d44289b95: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 147GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B 
Instruct H100_NVLx4 FP8 Latency ngcMetadata: 76fc388794dc368145a440d16d72c0ba70e4aecac09901fe4a2c06a767c7eb0d: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 4 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:h100x4-throughput-fp8-xuwerjgoba framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100x4 FP8 Throughput ngcMetadata: 7d8a02f47911fb7ddf1a6f6b09438f621b6057cb21098999484f09d5a5bb7b23: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:h100x2-throughput-fp8-bgcmw4su3w framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100x2 FP8 Throughput ngcMetadata: 7ee2258631ed9d51ebfe5ab44bd547ae5777217686d87cc89c15d06ccdca4047: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:h100x4-throughput-bf16-mimvphw4mg framework: 
TensorRT-LLM displayName: Llama 3.1 70B Instruct H100x4 BF16 Throughput ngcMetadata: 7f99ed5107c79b938b0ef4fcf2dd21aac27281f71d41a0a7c46d649879d374f0: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 138GB - profileId: nim/meta/llama-3_1-70b-instruct:b200x1-throughput-fp8-pqzaqotuoq framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct B200x1 FP8 Throughput ngcMetadata: 8b87146e39b0305ae1d73bc053564d1b4b4c565f81aa5abe3e84385544ca9b60: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 68GB - profileId: nim/meta/llama-3_1-70b-instruct:a10gx8-throughput-bf16-iklvwtod4w framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct A10Gx8 BF16 Throughput ngcMetadata: 935ec3ac922bf54106311dfc6b3214a1651a26033b4f5007b6351fffb4058b7a: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 8 - key: GPU DEVICE value: 2237:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 150GB - profileId: 
nim/meta/llama-3_1-70b-instruct:b200x4-latency-fp8-j5gt1gjpha framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct B200x4 FP8 Latency ngcMetadata: 9527145a2d1316a1e55581d1f6b0a45e394fe37b853ec5172dea14c2c9767d96: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 4 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:h200x4-latency-bf16-xsddatmm2w framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H200x4 BF16 Latency ngcMetadata: 99142c13a095af184ae20945a208a81fae8d650ac0fd91747b03148383f882cf: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 4 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:b200x4-throughput-bf16-jpnclv9i-w framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct B200x4 BF16 Throughput ngcMetadata: 9b4836e143f78d245cf161c16a225be11d3e8f9b2024b99dd76e5b2ac6cd7efd: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 4 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - 
profileId: nim/meta/llama-3_1-70b-instruct:h200x1-throughput-fp8-e2wepn6pma framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H200x1 FP8 Throughput ngcMetadata: af876a179190d1832143f8b4f4a71f640f3df07b0503259cedee3e3a8363aa96: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 68GB - profileId: nim/meta/llama-3_1-70b-instruct:h200x2-throughput-bf16-qe3ldz912g framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H200x2 BF16 Throughput ngcMetadata: b407d3df1db123ba8a4c98fb9f73790c01cd53a70fa0e0185814ad57a17cb72b: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:h200x2-throughput-fp8-qaimoqhvoq framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H200x2 FP8 Throughput ngcMetadata: c91a755246cb08dd9aa6905bc40b7db552071d141a850be5a791b06eb4fb2ef8: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - 
key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:h100x8-throughput-bf16-o1xmgf-zsg framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100x8 BF16 Throughput ngcMetadata: d128c772583bd10da4f31bf8e961893eb2b62363f3cecb94b5ef67d8bbd54665: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 147GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H100_NVLx8 BF16 Throughput ngcMetadata: d14fa7bd1f4287e74b856fe3f0030312cc4d03b8fe35a8c8aaedf0140ac55067: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 8 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:b200x2-throughput-bf16-mcpnmtluwq framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct B200x2 BF16 Throughput ngcMetadata: d33e8144476992a7d8d621d8e50cf66b89d254dc721aa2782e5a5a6f07b1af80: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 2 - key: GPU DEVICE value: 
2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct A100_SXM4_40GBx8 BF16 Throughput ngcMetadata: dc0f5f87ca37f69af7f525ac293c599cd0cbdaf8130da4d9e2ad63d376b12039: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: A100_SXM4_40GB gpu_device: 20b0:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100_SXM4_40GB - key: COUNT value: 8 - key: GPU DEVICE value: 20B0:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:h200x2-latency-fp8-gi3mfprtxq framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct H200x2 FP8 Latency ngcMetadata: e4f217a5fb016b570e34b8a8eb06051ccfef9534ba43da973bb7f678242eaa5f: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:b200x4-latency-bf16-la2mlox8dg framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct B200x4 BF16 Latency ngcMetadata: f17543bf1ee65e4a5c485385016927efe49cbc068a6021573d83eacb32537f76: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 4 
- key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct Generic NVIDIA GPUx8 BF16 ngcMetadata: 1d7b604f835f74791e6bfd843047fc00a5aef0f72954ca48ce963811fb6f3f09: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '8' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 8 - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct Generic NVIDIA GPUx2 BF16 ngcMetadata: 375dc0ff86133c2a423fbe9ef46d8fdf12d6403b3caa3b8e70d7851a89fc90dd: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '2' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 2 - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - profileId: nim/meta/llama-3_1-70b-instruct:hf-1d54af3-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 70B Instruct Generic NVIDIA GPUx4 BF16 ngcMetadata: 54946b08b79ecf9e7f2d5c000234bf2cce19c8fee21b243c1a084b03897e8c95: model: meta/llama-3.1-70b-instruct release: 1.8.5 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '4' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 4 - key: NIM VERSION value: 1.8.5 - key: DOWNLOAD SIZE value: 132GB - variantId: Llama 3.1 8B Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/llama-3_1-8b-instruct-nemo optimizationProfiles: - profileId: nim/meta/llama-3.1-8b-instruct:a100x2-latency-bf16-dxn2qkwphq framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct A100x2 BF16 
Latency ngcMetadata: 2146fcf18ea0412d564c6ed21d2f727281b95361fd78ccfa3d0570ec1716e8db: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 2 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/meta/llama-3.1-8b-instruct:a100x1-throughput-bf16-wlgvs1umtg framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct A100x1 BF16 Throughput ngcMetadata: 222d1729a785201e8a021b226d74d227d01418c41b556283ee1bdbf0a818bd94: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 1 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct H100_NVLx1 BF16 Throughput ngcMetadata: 25b5e251d366671a4011eaada9872ad1d02b48acc33aa0637853a3e3c3caa516: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/meta/llama-3.1-8b-instruct:h200x1-throughput-bf16-o8q-ystghg framework: TensorRT-LLM displayName: Llama 3.1 8B 
Instruct H200x1 BF16 Throughput ngcMetadata: 434e8d336fa23cbe151748d32b71e196d69f20d319ee8b59852a1ca31a48d311: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/meta/llama-3.1-8b-instruct:b200x2-latency-fp8-jg84ho12tg framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct B200x2 FP8 Latency ngcMetadata: 4950d30811e1e426e97cda69e6c03a8a4819db8aa4abf34722ced4542a1f6b52: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 2 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct H100_NVLx1 FP8 Throughput ngcMetadata: 5811750e70b7e9f340f4d670c72fcbd5282e254aeb31f62fd4f937cfb9361007: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/meta/llama-3.1-8b-instruct:h200x2-latency-bf16-1to0kzerqq framework: TensorRT-LLM displayName: Llama 3.1 
8B Instruct H200x2 BF16 Latency ngcMetadata: 6832a9395f54086162fd7b1c6cfaae17c7d1e535a60e2b7675504c9fc7b57689: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/meta/llama-3.1-8b-instruct:h100x2-latency-fp8-ebdyccaccw framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct H100x2 FP8 Latency ngcMetadata: 6c3f01dd2b2a56e3e83f70522e4195d3f2add70b28680082204bbb9d6150eb04: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/meta/llama-3.1-8b-instruct:h100x1-throughput-fp8-qaruykck6q framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct H100x1 FP8 Throughput ngcMetadata: 7b508014e846234db3cabe5c9f38568b4ee96694b60600a0b71c621dc70cacf3: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/meta/llama-3.1-8b-instruct:l40sx4-latency-bf16-zffhxt5r8w framework: TensorRT-LLM displayName: Llama 3.1 8B 
Instruct L40Sx4 BF16 Latency ngcMetadata: 844ebe2b42df8de8ce66cbb6ecf43f90858ea7efc14ddf020cf1ae7450ae0c33: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 19GB - profileId: nim/meta/llama-3.1-8b-instruct:b200x1-throughput-fp8-y2ykzepf-w framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct B200x1 FP8 Throughput ngcMetadata: 8b87146e39b0305ae1d73bc053564d1b4b4c565f81aa5abe3e84385544ca9b60: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/meta/llama-3.1-8b-instruct:l40sx2-throughput-bf16-bvf3cu1zyg framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct L40Sx2 BF16 Throughput ngcMetadata: 973a6bfbfc5d13fc5eb18f5011fab777a5bd257d5807e97f842a3364e82160dc: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 8B 
Instruct H100_NVLx2 FP8 Latency ngcMetadata: a00ce1e782317cd19ed192dcb0ce26ab8b0c1da8928c33de8893897888ff7580: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 2 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/meta/llama-3.1-8b-instruct:b200x1-throughput-bf16-hirlxvtedg framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct B200x1 BF16 Throughput ngcMetadata: a4c63a91bccf635b570ddb6d14eeb6e7d0acb2389712892b08d21fad2ceaee38: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/meta/llama-3.1-8b-instruct:l40sx1-throughput-bf16-twgrayi1-g framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct L40Sx1 BF16 Throughput ngcMetadata: ac5071bbd91efcc71dc486fcd5210779570868b3b8328b4abf7a408a58b5e57c: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/meta/llama-3.1-8b-instruct:l40sx1-throughput-fp8-ao4io-s5ow framework: 
TensorRT-LLM displayName: Llama 3.1 8B Instruct L40Sx1 FP8 Throughput ngcMetadata: ad17776f4619854fccd50354f31132a558a1ca619930698fd184d6ccf5fe3c99: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/meta/llama-3.1-8b-instruct:h200x1-throughput-fp8-rauvqtnsoq framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct H200x1 FP8 Throughput ngcMetadata: af876a179190d1832143f8b4f4a71f640f3df07b0503259cedee3e3a8363aa96: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/meta/llama-3.1-8b-instruct:h100x2-latency-bf16-lsdwkmc2tg framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct H100x2 BF16 Latency ngcMetadata: b3d535c0a7eaaea089b087ae645417c0b32fd01e7e9d638217cc032e51e74fd0: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: 
TensorRT-LLM displayName: Llama 3.1 8B Instruct H100_NVLx2 BF16 Latency ngcMetadata: b7fad3b35b07d623fac6549078305b71d0e6e1d228a86fa0f7cfe4dbeca9151a: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 2 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/meta/llama-3.1-8b-instruct:l40sx2-latency-fp8-tu6g7vu05a framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct L40Sx2 FP8 Latency ngcMetadata: c4ff823a8202af4b523274fb8c6cdd73fa8ee5af16391a6d36b17f714a3c71a0: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/meta/llama-3.1-8b-instruct:h200x2-latency-fp8-wlbch9i5qg framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct H200x2 FP8 Latency ngcMetadata: e4f217a5fb016b570e34b8a8eb06051ccfef9534ba43da973bb7f678242eaa5f: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 2 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/meta/llama-3.1-8b-instruct:h100x1-throughput-bf16-jiqjalwe7w framework: 
TensorRT-LLM displayName: Llama 3.1 8B Instruct H100x1 BF16 Throughput ngcMetadata: e7dbd9a8ce6270d2ec649a0fecbcae9b5336566113525f20aee3809ba5e63856: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/meta/llama-3.1-8b-instruct:b200x2-latency-bf16-vunbkvteua framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct B200x2 BF16 Latency ngcMetadata: f44768c625db71a327cf17e750d5e1a8e60171a8d8ef6b4c1c4b57fe74c9bf46: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 2 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/meta/llama-3.1-8b-instruct:l40sx2-latency-bf16-jrvo7gca9g framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct L40Sx2 BF16 Latency ngcMetadata: fa36c3502e92c50f78a1906242f929864955e702b7dbfbdb19758fb7ee9aa811: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: 
TensorRT-LLM displayName: Llama 3.1 8B Instruct GH200_480GBx1 FP8 Throughput ngcMetadata: f49b49f3d90159a594def51efd8595f1d618e288bca2721fe08e786a1ac67d04: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: GH200_480GB gpu_device: 2342:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: GH200_480GB - key: COUNT value: 1 - key: GPU DEVICE value: 2342:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct GH200_480GBx1 BF16 Throughput ngcMetadata: f7f74ecd523cd63065a50016a8786a893b9b1efe0d313bc5bcc54682f56e55fe: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: GH200_480GB gpu_device: 2342:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: GH200_480GB - key: COUNT value: 1 - key: GPU DEVICE value: 2342:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/meta/llama-3.1-8b-instruct:a10gx2-throughput-bf16-7c0u0kiiqw framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct A10Gx2 BF16 Throughput ngcMetadata: 8a62b002be0b7f82c407e5ed45c50dabe654deca052b521a920682f918323d0d: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 2 - key: GPU DEVICE value: 2237:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 17GB - profileId: 
nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct Generic NVIDIA GPUx2 BF16 ngcMetadata: 375dc0ff86133c2a423fbe9ef46d8fdf12d6403b3caa3b8e70d7851a89fc90dd: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '2' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 2 - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct Generic NVIDIA GPUx4 BF16 ngcMetadata: 54946b08b79ecf9e7f2d5c000234bf2cce19c8fee21b243c1a084b03897e8c95: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '4' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 4 - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/meta/llama-3.1-8b-instruct:hf-8c22764-nim1.3b framework: TensorRT-LLM displayName: Llama 3.1 8B Instruct Generic NVIDIA GPUx1 BF16 ngcMetadata: ac34857f8dcbd174ad524974248f2faf271bd2a0355643b2cf1490d0fe7787c2: model: meta/llama-3.1-8b-instruct release: 1.8.6 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '1' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 15GB labels: - Llama - Meta - Text Generation - Large Language Model - TensorRT-LLM - Language Generation - NeMo - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: StarCoder2-7B displayName: StarCoder2-7B modelHubID: starcoder2-7b category: Code type: NGC description: Code generation model trained on 17 programming languages. 
The model is optimized for code completion, synthesis, and fill-in-the-middle objective. requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3/license/ modelVariants: - variantId: StarCoder2-7B source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/bigcode/containers/starcoder2-7b optimizationProfiles: - profileId: nim/bigcode/starcoder2-7b:hf-bb9afde framework: TensorRT-LLM displayName: StarCoder2-7B A10Gx2 BF16 ngcMetadata: 375dc0ff86133c2a423fbe9ef46d8fdf12d6403b3caa3b8e70d7851a89fc90dd: model: bigcode/starcoder2-7b release: 1.8.1 tags: feat_lora: 'false' llm_engine: tensorrt_llm gpu: A10G gpu_device: 2237:10de pp: '1' precision: bf16 tp: '2' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: GPU value: A10G - key: GPU DEVICE value: 2237:10de - key: COUNT value: 2 - key: NIM VERSION value: 1.8.1 - key: DOWNLOAD SIZE value: 14GB - profileId: nim/bigcode/starcoder2-7b:h100x2-latency-fp8-zxtdqz4nva framework: TensorRT-LLM displayName: StarCoder2-7B H100x2 FP8 Latency ngcMetadata: 6c3f01dd2b2a56e3e83f70522e4195d3f2add70b28680082204bbb9d6150eb04: model: bigcode/starcoder2-7b release: 1.8.1 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.8.1 - key: DOWNLOAD SIZE value: 8GB - profileId: nim/bigcode/starcoder2-7b:h100x1-throughput-fp8-gxzrmbzlca framework: TensorRT-LLM displayName: StarCoder2-7B H100 FP8 Throughput ngcMetadata: 7b508014e846234db3cabe5c9f38568b4ee96694b60600a0b71c621dc70cacf3: model: bigcode/starcoder2-7b release: 1.8.1 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm 
number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.8.1 - key: DOWNLOAD SIZE value: 8GB - profileId: nim/bigcode/starcoder2-7b:hf-bb9afde framework: TensorRT-LLM displayName: StarCoder2-7B L40S BF16 ngcMetadata: ac34857f8dcbd174ad524974248f2faf271bd2a0355643b2cf1490d0fe7787c2: model: bigcode/starcoder2-7b release: 1.8.1 tags: feat_lora: 'false' llm_engine: tensorrt_llm gpu: L40S gpu_device: 26b9:10de pp: '1' precision: bf16 tp: '1' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: GPU value: L40S - key: GPU DEVICE value: 26b9:10de - key: COUNT value: 1 - key: NIM VERSION value: 1.8.1 - key: DOWNLOAD SIZE value: 14GB - profileId: nim/bigcode/starcoder2-7b:h100x2-latency-bf16-tqld74axpq framework: TensorRT-LLM displayName: StarCoder2-7B H100x2 BF16 Latency ngcMetadata: b3d535c0a7eaaea089b087ae645417c0b32fd01e7e9d638217cc032e51e74fd0: model: bigcode/starcoder2-7b release: 1.8.1 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.8.1 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/bigcode/starcoder2-7b:h100x1-throughput-bf16-bouv9kemrw framework: TensorRT-LLM displayName: StarCoder2-7B H100 BF16 Throughput ngcMetadata: e7dbd9a8ce6270d2ec649a0fecbcae9b5336566113525f20aee3809ba5e63856: model: bigcode/starcoder2-7b release: 1.8.1 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value:
Throughput - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.8.1 - key: DOWNLOAD SIZE value: 15GB labels: - bigCode - StarCoder - "Code Generation" - "Text Generation" - "Multilingual support" - Large Language Model - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Mistral Instruct displayName: Mistral Instruct modelHubID: mistral-instruct category: Language type: NGC description: Efficient instruction-tuned model for broad text generation and instruction following. Compact size makes it easy to fine-tune for specialized use cases. modelVariants: - variantId: Mistral 7B Instruct displayName: Mistral 7B Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/mistralai/containers/mistral-7b-instruct-v0.3 optimizationProfiles: - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-h100x1-bf16-throughput.1.3.655593 displayName: Mistral 7B Instruct H100 BF16 Throughput framework: TensorRT-LLM ngcMetadata: ed4af8b6563348d37f72bfd013be44573a1c88f384ef8fb3eaf0c69e4f235c20: container_url: nvcr.io/nim/mistralai/mistral-7b-instruct-v03:1.1.2 model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 14GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-l40sx1-bf16-throughput.1.3.655593 displayName: Mistral 7B Instruct L40S BF16 Throughput framework: TensorRT-LLM ngcMetadata: 8af967d80ae8f30f4635a59b2140fdc2b38d3004e16e66c9667fa032e56497fd: container_url: nvcr.io/nim/mistralai/mistral-7b-instruct-v03:1.1.2 model: 
mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26b5:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 14GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-a10gx2-bf16-throughput.1.3.127462 displayName: Mistral 7B Instruct A10Gx2 BF16 Throughput framework: TensorRT-LLM ngcMetadata: 57beb7b4f94f72519842de3e1b4cda5ae0774271cf433ff56180551e0f15d0c8: container_url: nvcr.io/nim/mistralai/mistral-7b-instruct-v03:1.1.2 model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 2 - key: GPU DEVICE value: 2237:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 14GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-l40sx2-fp8-latency.1.3.885640 displayName: Mistral 7B Instruct L40Sx2 FP8 Latency framework: TensorRT-LLM ngcMetadata: 138437d95405e4dad69a8cd4dc6126a2b8fc9254a274af83b1fd0b1b01658b55: model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26b9:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 8GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-l40sx2-bf16-latency.1.3.655593 displayName: Mistral 7B Instruct L40Sx2 BF16 Latency framework:
TensorRT-LLM ngcMetadata: 4c50d586aaa9b9a484d5090213be8ff5db7f5b775aa94b66651eac515108f16c: model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26b9:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-h100x2-bf16-latency.1.3.655593 displayName: Mistral 7B Instruct H100x2 BF16 Latency framework: TensorRT-LLM ngcMetadata: 8c27f77dab1986e76b524c755fa5a809f8882517b503e76bfcf8d42b991adc89: model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-a100x1-bf16-throughput.1.3.127462 displayName: Mistral 7B Instruct A100 BF16 Throughput framework: TensorRT-LLM ngcMetadata: 9189d008806a9638d4206e6ff94c0b0d9acc2a8861f6de5a49b9d0a5acdcf049: model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 1 - key: GPU DEVICE value: 20b2:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 15GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-a10gx4-bf16-latency.1.3.127462 displayName: Mistral 7B Instruct 
A10Gx4 BF16 Latency framework: TensorRT-LLM ngcMetadata: 9bccc20c28c1728b59cdbad4b2c1607d3b57388ff266da4477ea8a413ae0fb7d: model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 4 - key: GPU DEVICE value: 2237:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-l40sx1-fp8-throughput.1.3.885640 displayName: Mistral 7B Instruct L40S FP8 Throughput framework: TensorRT-LLM ngcMetadata: f34180a7eb689e915c741cda5ea015ac54b134a73b13b0b2865a5a4e44291a85: model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26b5:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 8GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-h100x1-fp8-throughput.1.3.885640 displayName: Mistral 7B Instruct H100 FP8 Throughput framework: TensorRT-LLM ngcMetadata: f8b5f71dd66c36c70deac7927cbd98b1c4f78caf1abf01f768be7118e1daa278: model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 8GB - profileId: nim/mistralai/mistral-7b-instruct-v0-3:0.12+2333135a3-h100x2-fp8-latency.1.3.885640 
displayName: Mistral 7B Instruct H100x2 FP8 Latency framework: TensorRT-LLM ngcMetadata: fa55c825306dfc09c9d0e7ef423e897d91fe8334a3da87d284f45f45cbd4c1b0: model: mistralai/mistral-7b-instruct-v0.3 release: 1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 8GB labels: - Mistral - Instruct - Large Language Model - TensorRT-LLM - Language Generation - NeMo - NVIDIA Validated config: architectures: - Other modelType: Mistral license: NVIDIA AI Foundation Models Community License - name: Mixtral Instruct displayName: Mixtral Instruct modelHubID: mixtral-instruct category: Language type: NGC description: Sparse Mixture-of-Experts model fine-tuned for instruction following with a 32K token context window. Matches or outperforms much larger dense models on standard benchmarks. 
modelVariants: - variantId: Mixtral 8x7B Instruct displayName: Mixtral 8x7B Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/mistralai/containers/mixtral-8x7b-instruct-v01 optimizationProfiles: - profileId: nim/mistralai/mixtral-8x7b-instruct-v01:0.12.0+2333135a3-a10gx8-fp16-throughput.1.3.18301798 displayName: Mixtral 8x7B Instruct A10Gx8 FP16 Throughput framework: TensorRT-LLM ngcMetadata: 03501a01c138dcfc63fc672c20053e3fca8d7bdae1f448165d7bed3f241973cf: model: mistralai/mixtral-8x7b-instruct-v0.1 release: 1.3.0 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm pp: '1' precision: fp16 profile: throughput tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP16 - key: GPU value: A10G - key: COUNT value: 8 - key: GPU DEVICE value: 2237:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 89GB - profileId: nim/mistralai/mixtral-8x7b-instruct-v01:0.12.0+2333135a3-h100x2-int8wo-throughput.1.3.18301798 displayName: Mixtral 8x7B Instruct H100x2 int8wo Throughput framework: TensorRT-LLM ngcMetadata: 208d53be878cb4d31c9019a80637c54e441e4a4edbee17754d1fc1b0b31b1cc1: model: mistralai/mixtral-8x7b-instruct-v0.1 release: 1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: int8wo profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: INT8WO - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 48GB - profileId: nim/mistralai/mixtral-8x7b-instruct-v01:0.12.0+2333135a3-h100x2-fp16-throughput.1.3.18301798 displayName: Mixtral 8x7B Instruct H100x2 FP16 Throughput framework: TensorRT-LLM ngcMetadata: bbaccf5c5f059943db905cfcb4e9f2e4e83f0da3617abd244b693103d13005f4: container_url: nvcr.io/nim/mistralai/mixtral-8x7b-instruct-v01:1.2.1 model: mistralai/mixtral-8x7b-instruct-v0.1 release:
1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP16 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 94GB - profileId: nim/mistralai/mixtral-8x7b-instruct-v01:0.12.0+2333135a3-l40sx4-fp8-throughput.1.3.18301798 displayName: Mixtral 8x7B Instruct L40Sx4 FP8 Throughput framework: TensorRT-LLM ngcMetadata: 4a7fcddcd723f52264e0a9b90b3a17674d1ceb11000aa6dfa50e8a9f1d7c4c8e: model: mistralai/mixtral-8x7b-instruct-v0.1 release: 1.3.0 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26b5:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 94GB - profileId: nim/mistralai/mixtral-8x7b-instruct-v01:0.12.0+2333135a3-l40sx4-fp16-throughput.1.3.18301798 displayName: Mixtral 8x7B Instruct L40Sx4 FP16 Throughput framework: TensorRT-LLM ngcMetadata: 536502b5ba23293b7a9bd6dfabd9b93d2d82c8436d0788cc748b28aefd4adf79: model: mistralai/mixtral-8x7b-instruct-v0.1 release: 1.3.0 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: fp16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP16 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26b5:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 95GB - profileId: nim/mistralai/mixtral-8x7b-instruct-v01:0.12.0+2333135a3-h100x4-int8wo-latency.1.3.18301798 displayName: Mixtral 8x7B Instruct H100x4 INT8WO Latency framework: TensorRT-LLM ngcMetadata:
5cf31967505bc7d4e792563c5521545703cee2be36714b6944e0e33adb70409a: model: mistralai/mixtral-8x7b-instruct-v0.1 release: 1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: int8wo profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: INT8WO - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 48GB - profileId: nim/mistralai/mixtral-8x7b-instruct-v01:0.12.0+2333135a3-h100x4-fp16-latency.1.3.18301798 displayName: Mixtral 8x7B Instruct H100x4 FP16 Latency framework: TensorRT-LLM ngcMetadata: ed45c32307812aa9b45ef8b3f73d635a4ed8af4ee46ffa09253fc529fbfd55db: model: mistralai/mixtral-8x7b-instruct-v0.1 release: 1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: FP16 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 95GB - profileId: nim/mistralai/mixtral-8x7b-instruct-v01:0.12.0+2333135a3-h100x4-fp8-latency.1.3.18301798 displayName: Mixtral 8x7B Instruct H100x4 FP8 Latency framework: TensorRT-LLM ngcMetadata: f255f2c7d6787f8b436aa1a74280ebb1a736fa21ae39fd56aeef92f10f7c9c81: model: mistralai/mixtral-8x7b-instruct-v0.1 release: 1.3.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 48GB - variantId: Mixtral 8x22B Instruct displayName: Mixtral 8x22B Instruct source: URL: 
https://catalog.ngc.nvidia.com/orgs/nim/teams/mistralai/containers/mixtral-8x22b-instruct-v01 optimizationProfiles: - profileId: nim/mistralai/mixtral-8x22b-instruct-v01:0.10.1+79a76176-h100x8-int8wo-throughput.1.2.2.16140417 displayName: Mixtral 8x22B Instruct H100 int8wo Throughput framework: TensorRT-LLM ngcMetadata: 4ad9a208ce0f8ec41cd6b8681cd0ddf6fbeb406efb3d9baf6847a3fb8bac5863: container_url: nvcr.io/nim/mistralai/mixtral-8x22b-instruct-v01:1.0.0 model: mistralai/mixtral-8x22b-instruct-v0.1 model_type: text_generation release: 1.0.0 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: int8wo profile: throughput tp: '8' workspace: !workspace components: - dst: '' src: files: - !name 'README.md' - !name 'checksums.blake3' - !name 'config.json' - !name 'generation_config.json' - !name 'model.safetensors.index.json' - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer.model' - !name 'tokenizer_config.json' repo_id: ngc://nim/mistralai/mixtral-8x22b-instruct-v01:hf-52572b2 - dst: trtllm_engine src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'config.json' - !name 'metadata.json' - !name 'rank0.engine' - !name 'rank1.engine' - !name 'rank2.engine' - !name 'rank3.engine' - !name 'rank4.engine' - !name 'rank5.engine' - !name 'rank6.engine' - !name 'rank7.engine' - !name 'trt_llm_config.yaml' repo_id: ngc://nim/mistralai/mixtral-8x22b-instruct-v01:0.10.1+79a76176-h100x8-int8wo-throughput.1.0.0.16140417 sha: 4ad9a208ce0f8ec41cd6b8681cd0ddf6fbeb406efb3d9baf6847a3fb8bac5863 modelFormat: trt-llm latestVersionSizeInBytes: 144762798586 spec: - key: PROFILE value: Throughput - key: PRECISION value: int8wo - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.2.2 - key: DOWNLOAD SIZE value: 144GB - profileId: nim/mistralai/mixtral-8x22b-instruct-v01:0.11.1+14957bf8-h100x8-fp16-throughput.1.1.2.17572569
displayName: Mixtral 8x22B Instruct H100 FP16 Throughput framework: TensorRT-LLM ngcMetadata: e44c755ef6628cccb74ccf58af4a6efa039f7e49e07a9dd7a27eb17f6500964e: container_url: nvcr.io/nim/mistralai/mixtral-8x22b-instruct-v01:1.2.2 model: mistralai/mixtral-8x22b-instruct-v0.1 release: 1.2.2 tags: feat_lora: false gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp16 profile: throughput tp: '8' workspace: !workspace components: - dst: '' src: files: - !name 'README.md' - !name 'checksums.blake3' - !name 'config.json' - !name 'generation_config.json' - !name 'model.safetensors.index.json' - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer.model' - !name 'tokenizer_config.json' - !name 'tool_use_config.json' repo_id: ngc://nim/mistralai/mixtral-8x22b-instruct-v01:hf-1702b01-tool-calling - dst: trtllm_engine src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'config.json' - !name 'metadata.json' - !name 'rank0.engine' - !name 'rank1.engine' - !name 'rank2.engine' - !name 'rank3.engine' - !name 'rank4.engine' - !name 'rank5.engine' - !name 'rank6.engine' - !name 'rank7.engine' repo_id: ngc://nim/mistralai/mixtral-8x22b-instruct-v01:0.11.1+14957bf8-h100x8-fp16-throughput.1.1.2.17572569 sha: e44c755ef6628cccb74ccf58af4a6efa039f7e49e07a9dd7a27eb17f6500964e modelFormat: trt-llm latestVersionSizeInBytes: 285170977174 spec: - key: PROFILE value: Throughput - key: PRECISION value: FP16 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.2.2 - key: DOWNLOAD SIZE value: 285GB labels: - Mistral - Instruct - Large Language Model - TensorRT-LLM - Language Generation - NeMo - NVIDIA Validated config: architectures: - Other modelType: mistral license: NVIDIA AI Foundation Models Community License - name: Deepseek R1 Distill Llama displayName: Deepseek R1 Distill Llama modelHubID: deepseek-r1-distill-llama category: Language type: NGC 
description: Distilled reasoning model excelling at math, science, and code tasks. Delivers state-of-the-art performance on complex reasoning benchmarks. requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3/license/ modelVariants: - variantId: Deepseek R1 Distill Llama 70b source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/deepseek-ai/containers/deepseek-r1-distill-llama-70b optimizationProfiles: - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-70b:l40sx4-throughput-fp8-46u3lvp6ja framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 70B L40Sx4 FP8 Throughput ngcMetadata: 23c28e4a1ad4d963c1504f1a33b45afb65bf61b64b20be1a8ea2c8816ea0fc36: model: deepseek-r1-distill-llama-70b release: 1.5.2 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-70b:h100x4-latency-fp8-k5tlofelyw framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 70B H100x4 FP8 Latency ngcMetadata: 4696d5c5b44b13bb5e864affcdcfa30ad229390285476315d9921fd0828bda5b: model: deepseek-r1-distill-llama-70b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: fp8 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-70b:h100x8-latency-fp8-xz3eymtuzq 
framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 70B H100x8 FP8 Latency ngcMetadata: 91f2b7c9e719c0c380ba6c1d6c3e5cad61aaf807730de88fa3b6233a39edeeaa: model: deepseek-r1-distill-llama-70b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: fp8 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 70GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-70b:h100x2-throughput-fp8-8cx2penaia framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 70B H100x2 FP8 Throughput ngcMetadata: da94a5c34cf665e85813fa49f321f1e87ca12317722b5e65628cf3ed0371897b: model: deepseek-r1-distill-llama-70b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 69GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-70b:h100x4-throughput-bf16-g31fj2uvrw framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 70B H100x4 BF16 Throughput ngcMetadata: e6b8fb8c4c76343b05b9051974593e5bd9110a868770d52e8eb0fe5a3b46dd67: model: deepseek-r1-distill-llama-70b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: throughput tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 138GB - 
profileId: nim/deepseek-ai/deepseek-r1-distill-llama-70b:h100x8-latency-bf16-v8q6jmcd9g framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 70B H100x8 BF16 Latency ngcMetadata: f87605b6d8cfc0ca39fad21b4ec580219f3a3be42884d2c7caad9b8ae4b3c1c7: model: deepseek-r1-distill-llama-70b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '8' pp: '1' precision: bf16 profile: latency tp: '8' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 8 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 147GB - variantId: Deepseek R1 Distill Llama 8b source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/deepseek-ai/containers/deepseek-r1-distill-llama-8b optimizationProfiles: - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:l40sx1-throughput-fp8-vbqc0btoqg framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B L40Sx1 FP8 Throughput ngcMetadata: d968c663c710e56275088096bc0dcf823560aaf7dca910bfcb41f5056063ab02: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:h100x1-throughput-fp8-d9grrq-lka framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B H100x1 FP8 Throughput ngcMetadata: 0bdec027404c16d6ca96e159079082f9630a24a277ff519d0c8fea71007222ec: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: 
'1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:h100x2-latency-bf16-7ztok5r0dg framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B H100x2 BF16 Latency ngcMetadata: 0ce355335e6c3aec54e49ab53822e628fa1227091d0326da962bcc4f95b5f602: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 17GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:a10gx4-latency-bf16-aiejrysrlw framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B A10Gx4 BF16 Latency ngcMetadata: 1dfac8e12042573dc93536a393902478e1a6a46d1cd742cf0a4251c11f77e253: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm number_of_gpus: '4' pp: '1' precision: bf16 profile: latency tp: '4' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 4 - key: GPU DEVICE value: 2237:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 19GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:l40sx2-latency-fp8-fmuoxfbb0q framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B L40Sx2 FP8 Latency ngcMetadata: c2d4efce2d553c3aa78109b6d5dff0fd34b86bbb3b765aa8afdf12e9d13e8e83: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: 
feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:h100x1-throughput-bf16-4jcstzx27q framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B H100x1 BF16 Throughput ngcMetadata: 4f6dba657c08280bdb419cbc1c60d265e82731b807ee2ae3c111cb9a91571aa1: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:h100x2-latency-fp8-q8xwzp22aa framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B H100x2 FP8 Latency ngcMetadata: 518edac01f731b63676743a1860fe21861d1399b19cb2e584de3d9a6a3ea6d8e: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: fp8 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 9GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:l40sx1-throughput-bf16-yvbnwvfzew framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B L40Sx1 BF16 Throughput ngcMetadata: 
9bc8e8aa12847674fa2840b9c03cbdb0246d7f144a5257510fd53eacc2a9d62f: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:a100x1-throughput-bf16-iq9maz9nkw framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B A100x1 BF16 Throughput ngcMetadata: c959aa89b69ad9295ccc99a34546819d16bb0e2566a6cfed0985eecf37bcc14b: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 1 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 16GB - profileId: nim/deepseek-ai/deepseek-r1-distill-llama-8b:l40sx2-latency-bf16-tlmx3sgrdw framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B L40Sx2 BF16 Latency ngcMetadata: 20d6bb61a1ee5160c0baed3721f8b580525a0aaaaa3b1333e9a882d4c61b1ed7: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: latency tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: LATENCY - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 17GB - profileId: 
nim/deepseek-ai/deepseek-r1-distill-llama-8b:a10gx2-throughput-bf16-uv8ptkf8-g framework: TensorRT-LLM displayName: Deepseek R1 Distill Llama 8B A10Gx2 BF16 Throughput ngcMetadata: edbb37d3ef94a5cc38919ab86694b835307c0668ca6d41ea746796b34ced78f1: model: deepseek-ai/deepseek-r1-distill-llama-8b release: 1.5.2 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm number_of_gpus: '2' pp: '1' precision: bf16 profile: throughput tp: '2' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 2 - key: GPU DEVICE value: 2237:10DE - key: NIM VERSION value: 1.5.2 - key: DOWNLOAD SIZE value: 17GB labels: - Deepseek - Distill - Llama - Meta - Chat - Large Language Model - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Llama 3.2 Instruct displayName: Llama 3.2 Instruct modelHubID: meta/llama-3.2-instruct category: Language type: NGC description: Lightweight multilingual models optimized for dialogue, agentic retrieval, and summarization. Strong benchmark performance in a compact footprint. 
requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3_2/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3_2/license/ modelVariants: - variantId: Llama 3.2 1B Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/llama-3_2-1b-instruct optimizationProfiles: - profileId: nim/meta/llama-3.2-1b-instruct:hf-e9f8eff-nim1.5+ framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct GH200_480GBx1 BF16 Throughput ngcMetadata: f7f74ecd523cd63065a50016a8786a893b9b1efe0d313bc5bcc54682f56e55fe: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: GH200_480GB gpu_device: 2342:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: GH200_480GB - key: COUNT value: 1 - key: GPU DEVICE value: 2342:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:hf-e9f8eff-nim1.5+ framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct GH200_480GBx1 FP8 Throughput ngcMetadata: f49b49f3d90159a594def51efd8595f1d618e288bca2721fe08e786a1ac67d04: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: GH200_480GB gpu_device: 2342:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: GH200_480GB - key: COUNT value: 1 - key: GPU DEVICE value: 2342:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:a100x1-throughput-bf16-a2zlotpozq framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct A100x1 BF16 Throughput ngcMetadata: 222d1729a785201e8a021b226d74d227d01418c41b556283ee1bdbf0a818bd94: model: meta/llama-3.2-1b-instruct release: 1.8.6 
tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 1 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:hf-e9f8eff-nim1.5+ framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct H100_NVLx1 BF16 Throughput ngcMetadata: 25b5e251d366671a4011eaada9872ad1d02b48acc33aa0637853a3e3c3caa516: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:h200x1-throughput-bf16-pkk6mlb47w framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct H200x1 BF16 Throughput ngcMetadata: 434e8d336fa23cbe151748d32b71e196d69f20d319ee8b59852a1ca31a48d311: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:hf-e9f8eff-nim1.5+ framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct H100_NVLx1 FP8 Throughput ngcMetadata: 5811750e70b7e9f340f4d670c72fcbd5282e254aeb31f62fd4f937cfb9361007: model: meta/llama-3.2-1b-instruct 
release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:a10gx1-throughput-bf16-orfmmsdx5a framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct A10Gx1 BF16 Throughput ngcMetadata: 74bfd8b2df5eafe452a9887637eef4820779fb4e1edb72a4a7a2a1a2d1e6480b: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 1 - key: GPU DEVICE value: 2237:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 4GB - profileId: nim/meta/llama-3.2-1b-instruct:h100x1-throughput-fp8-u882rrmxeg framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct H100x1 FP8 Throughput ngcMetadata: 7b508014e846234db3cabe5c9f38568b4ee96694b60600a0b71c621dc70cacf3: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 2GB - profileId: nim/meta/llama-3.2-1b-instruct:b200x1-throughput-fp8-qhbtqh0mwg framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct B200x1 FP8 Throughput ngcMetadata: 8b87146e39b0305ae1d73bc053564d1b4b4c565f81aa5abe3e84385544ca9b60: model: 
meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 2GB - profileId: nim/meta/llama-3.2-1b-instruct:b200x1-throughput-bf16-mficrdoqfw framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct B200x1 BF16 Throughput ngcMetadata: a4c63a91bccf635b570ddb6d14eeb6e7d0acb2389712892b08d21fad2ceaee38: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:l40sx1-throughput-bf16-vpk2vn8hrg framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct L40Sx1 BF16 Throughput ngcMetadata: ac5071bbd91efcc71dc486fcd5210779570868b3b8328b4abf7a408a58b5e57c: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:l40sx1-throughput-fp8-dngq15ocbg framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct L40Sx1 FP8 Throughput ngcMetadata: 
ad17776f4619854fccd50354f31132a558a1ca619930698fd184d6ccf5fe3c99: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 2GB - profileId: nim/meta/llama-3.2-1b-instruct:h200x1-throughput-fp8-ajzq-idegq framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct H200x1 FP8 Throughput ngcMetadata: af876a179190d1832143f8b4f4a71f640f3df07b0503259cedee3e3a8363aa96: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 2GB - profileId: nim/meta/llama-3.2-1b-instruct:hf-e9f8eff-nim1.5+ framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct A100_SXM4_40GBx1 BF16 Throughput ngcMetadata: c6821c013c559912c37e61d7b954c5ca8fe07dda76d8bea0f4a52320e0a54427: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: A100_SXM4_40GB gpu_device: 20b0:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: A100_SXM4_40GB - key: COUNT value: 1 - key: GPU DEVICE value: 20B0:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:h100x1-throughput-bf16-ns5-dcqtwq framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct 
H100x1 BF16 Throughput ngcMetadata: e7dbd9a8ce6270d2ec649a0fecbcae9b5336566113525f20aee3809ba5e63856: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - profileId: nim/meta/llama-3.2-1b-instruct:hf-e9f8eff-nim1.5+ framework: TensorRT-LLM displayName: Llama 3.2 1B Instruct BF16 ngcMetadata: ac34857f8dcbd174ad524974248f2faf271bd2a0355643b2cf1490d0fe7787c2: model: meta/llama-3.2-1b-instruct release: 1.8.6 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '1' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 3GB - variantId: Llama 3.2 3B Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/llama-3_2-3b-instruct optimizationProfiles: - profileId: nim/meta/llama-3.2-3b-instruct:a100x1-throughput-fp16-6m5dnxceua framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct A100x1 FP16 Throughput ngcMetadata: 34e8e5d0b7ac366e5247473de1ac4a6620fa6000e13f1a256735129d0db23761: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP16 - key: GPU value: A100 - key: COUNT value: 1 - key: GPU DEVICE value: 20B2:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/meta/llama-3.2-3b-instruct:hf-392a143-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.2 3B 
Instruct H100_NVLx1 FP16 Throughput ngcMetadata: 3c8257b0990b7e2a5c6a2ba21a4e8e4dac6bf6e43320f41281a0b5f6d6c0228d: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP16 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 6GB - profileId: nim/meta/llama-3.2-3b-instruct:hf-392a143-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct H100_NVLx1 FP8 Throughput ngcMetadata: 5811750e70b7e9f340f4d670c72fcbd5282e254aeb31f62fd4f937cfb9361007: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100_NVL gpu_device: 2321:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100_NVL - key: COUNT value: 1 - key: GPU DEVICE value: 2321:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 6GB - profileId: nim/meta/llama-3.2-3b-instruct:h200x1-throughput-fp16-ylknhtupda framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct H200x1 FP16 Throughput ngcMetadata: 5fbbb278b341858164183716ee8e04eb41e0d5283d6faedfba8aed4180535b53: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP16 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/meta/llama-3.2-3b-instruct:h100x1-throughput-fp8-cehb8f0o2q framework: 
TensorRT-LLM displayName: Llama 3.2 3B Instruct H100x1 FP8 Throughput ngcMetadata: 7b508014e846234db3cabe5c9f38568b4ee96694b60600a0b71c621dc70cacf3: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 5GB - profileId: nim/meta/llama-3.2-3b-instruct:b200x1-throughput-fp8-ecjbunkvug framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct B200x1 FP8 Throughput ngcMetadata: 8b87146e39b0305ae1d73bc053564d1b4b4c565f81aa5abe3e84385544ca9b60: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 5GB - profileId: nim/meta/llama-3.2-3b-instruct:hf-392a143-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct A100_SXM4_40GBx1 FP16 Throughput ngcMetadata: 8d4a2ffc83d859ba5a1c31912cb3e555f7c994111987b3e1101baae915371bf1: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: A100_SXM4_40GB gpu_device: 20b0:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP16 - key: GPU value: A100_SXM4_40GB - key: COUNT value: 1 - key: GPU DEVICE value: 20B0:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 6GB - profileId: 
nim/meta/llama-3.2-3b-instruct:a10gx1-throughput-fp16-ygf1h6yobw framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct A10Gx1 FP16 Throughput ngcMetadata: 8ffb5cb4d82407de65b02eb9749fd1fa84084137e05593706f33466259df9f6b: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP16 - key: GPU value: A10G - key: COUNT value: 1 - key: GPU DEVICE value: 2237:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/meta/llama-3.2-3b-instruct:b200x1-throughput-bf16-qk5bstzzsa framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct B200x1 BF16 Throughput ngcMetadata: a4c63a91bccf635b570ddb6d14eeb6e7d0acb2389712892b08d21fad2ceaee38: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: B200 gpu_device: 2901:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: B200 - key: COUNT value: 1 - key: GPU DEVICE value: 2901:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/meta/llama-3.2-3b-instruct:l40sx1-throughput-fp8-8ujyylyqww framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct L40Sx1 FP8 Throughput ngcMetadata: ad17776f4619854fccd50354f31132a558a1ca619930698fd184d6ccf5fe3c99: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE 
value: 5GB - profileId: nim/meta/llama-3.2-3b-instruct:h200x1-throughput-fp8-nj2npsfa-w framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct H200x1 FP8 Throughput ngcMetadata: af876a179190d1832143f8b4f4a71f640f3df07b0503259cedee3e3a8363aa96: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H200 gpu_device: 2335:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: H200 - key: COUNT value: 1 - key: GPU DEVICE value: 2335:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 5GB - profileId: nim/meta/llama-3.2-3b-instruct:h100x1-throughput-fp16-m2qe4ioxxq framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct H100x1 FP16 Throughput ngcMetadata: d6e3e406c0b0eaeb76f21ee5c5a545edfe8031d3cfa302030f690f38126b0ab8: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/meta/llama-3.2-3b-instruct:l40sx1-throughput-fp16-utfz33y-fg framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct L40Sx1 FP16 Throughput ngcMetadata: e9674f56ec90347526f8edf1cf407cce1441972ef277dee7a15f5c768112d2bd: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: L40S gpu_device: 26b9:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP16 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26B9:10DE - key: NIM VERSION value: 1.8.6 
- key: DOWNLOAD SIZE value: 7GB - profileId: nim/meta/llama-3.2-3b-instruct:hf-392a143-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct GH200_480GBx1 FP8 Throughput ngcMetadata: f49b49f3d90159a594def51efd8595f1d618e288bca2721fe08e786a1ac67d04: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: GH200_480GB gpu_device: 2342:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: fp8 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: FP8 - key: GPU value: GH200_480GB - key: COUNT value: 1 - key: GPU DEVICE value: 2342:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 6GB - profileId: nim/meta/llama-3.2-3b-instruct:hf-392a143-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct GH200_480GBx1 BF16 Throughput ngcMetadata: f7f74ecd523cd63065a50016a8786a893b9b1efe0d313bc5bcc54682f56e55fe: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' gpu: GH200_480GB gpu_device: 2342:10de llm_engine: tensorrt_llm number_of_gpus: '1' pp: '1' precision: bf16 profile: throughput tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: THROUGHPUT - key: PRECISION value: BF16 - key: GPU value: GH200_480GB - key: COUNT value: 1 - key: GPU DEVICE value: 2342:10DE - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 6GB - profileId: nim/meta/llama-3.2-3b-instruct:hf-392a143-0508-tool-use-v2 framework: TensorRT-LLM displayName: Llama 3.2 3B Instruct BF16 ngcMetadata: ac34857f8dcbd174ad524974248f2faf271bd2a0355643b2cf1490d0fe7787c2: model: meta/llama-3.2-3b-instruct release: 1.8.6 tags: feat_lora: 'false' llm_engine: tensorrt_llm pp: '1' precision: bf16 tp: '1' trtllm_buildable: 'true' modelFormat: trt-llm spec: - key: PRECISION value: BF16 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.6 - key: DOWNLOAD SIZE value: 6GB labels: - Llama - Meta - Multilingual Large Language Model - NVIDIA Validated 
config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: NeMo Retriever-Parse displayName: Nemoretriever Parse modelHubID: nemoretriever-parse category: Retrieval type: NGC description: Document text extraction model that converts document images into structured text with bounding boxes and semantic classes. Improves retrieval accuracy and supports LLM training data generation. requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3/license/ modelVariants: - variantId: nemoretriever-parse:1.2.0 source: URL: https://build.nvidia.com/nvidia/nemoretriever-parse optimizationProfiles: - profileId: nim/nvidia/nemoretriever-parse:a100x1-throughput-bf16-e9wjao-enw framework: TensorRT-LLM displayName: nemoretriever-parse A100 BF16 Throughput ngcMetadata: 19c68819d9428cfa494e977f4d2be6378215a8f610cce9bdfc0aa3cdd7d66aa9: model: nvidia/nemoretriever-parse release: 1.2.0 tags: gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm pp: '1' profile: throughput precision: bf16 tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 1 - key: GPU DEVICE value: 20b2:10de - key: NIM VERSION value: 1.2.0 - key: DOWNLOAD SIZE value: 600MB - profileId: nim/nvidia/nemoretriever-parse:h100x1-throughput-bf16-2apiazbpma framework: TensorRT-LLM displayName: nemoretriever-parse H100 BF16 Throughput ngcMetadata: 8db6dcd816ca1ce8d07e72d8b9c4682120b3c50799422361e35b4ab87820efd6: model: nvidia/nemoretriever-parse release: 1.2.0 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' profile: throughput precision: bf16 tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.2.0 - key: 
DOWNLOAD SIZE value: 600MB - profileId: nim/nvidia/nemoretriever-parse:l40sx1-throughput-bf16-r98ogb1a1a framework: TensorRT-LLM displayName: nemoretriever-parse L40S BF16 Throughput ngcMetadata: 00c8a43783e7acf3d59a0d773cd78d3d29eaa71fa4412af7af2fbaf20e196a8b: model: nvidia/nemoretriever-parse release: 1.2.0 tags: gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' profile: throughput precision: bf16 tp: '1' modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 1 - key: GPU DEVICE value: 26b5:10de - key: NIM VERSION value: 1.2.0 - key: DOWNLOAD SIZE value: 600MB labels: - NeMo - Text Extraction - Large Language Model - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Nemoretriever Graphic Elements V1 displayName: Nemoretriever Graphic Elements V1 modelHubID: nemoretriever-graphic-elements-v1 category: Retrieval type: NGC description: Specialized object detection model that extracts elements from charts and graphs — titles, axis labels, legends, and data annotations. Purpose-built for document understanding pipelines. 
requireLicense: true licenseAgreements: - label: Use Policy url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/ - label: License Agreement url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/ modelVariants: - variantId: Nemoretriever Graphic Elements V1 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/nemoretriever-graphic-elements-v1 optimizationProfiles: - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:a100x1-trt-fp16-7lfmwoem-q framework: TensorRT-LLM displayName: Nemoretriever Graphic Elements V1 A100-SXM4-80GBx1 FP16 ngcMetadata: 26c97e9919dae5405145446f00e0189f615ce682526ec9b6da88b5138ff8097d: model: nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A100-SXM4-80GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:h100-nvlx1-trt-fp16-pqwy1dxfxg framework: TensorRT-LLM displayName: Nemoretriever Graphic Elements V1 H100-NVLx1 FP16 ngcMetadata: 2e7da4417b1c44978e00f67823ae7ec885edb310e34c2c18464fbd6cad345065: model: nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: H100-NVL model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-NVL - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:h100x1-trt-fp16-tcgzpfpbwq framework: TensorRT-LLM displayName: Nemoretriever Graphic Elements V1 H100-HBM3-80GBx1 FP16 ngcMetadata: 
6097695b532c9abe549de9918de6b4702eda625f27b508acd7b7dcc04f38ebe1: model: nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: H100-HBM3-80GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-HBM3-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:a10gx1-trt-fp16-ge6bpqew8g framework: TensorRT-LLM displayName: Nemoretriever Graphic Elements V1 A10Gx1 FP16 ngcMetadata: 859ace730c899fb7b8362fe773639da57544f87584f9ed138089e85665653972: model: nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A10G model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A10G - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:l40sx1-trt-fp16-ltjamioamw framework: TensorRT-LLM displayName: Nemoretriever Graphic Elements V1 L40Sx1 FP16 ngcMetadata: 90699b066c264c9533628aeb4f1814ef51e0f2f021540e3ae77181f2ef9ce9ed: model: nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: L40S model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L40S - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:b200x1-trt-fp16-4kosyy1fig framework: TensorRT-LLM displayName: Nemoretriever Graphic Elements V1 B200x1 FP16 ngcMetadata: a7d7cc7f7236b793a7722a3f0777b0cf3a989cc3c2c34d3e66a392329e1530e7: model: 
nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: B200 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: B200 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:l4x1-trt-fp16-lhpdyawoug framework: TensorRT-LLM displayName: Nemoretriever Graphic Elements V1 L4x1 FP16 ngcMetadata: ed0b8106aedfc536be363b6f1f0901bd4cd371ef22e640ac03fa7f4e3ed71647: model: nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: L4 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L4 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:2_ONNX_FP16_1024 framework: ONNX displayName: Nemoretriever Graphic Elements V1 ONNX FP16 ngcMetadata: edc693c6fccd68d266622eace04225421e353d7ce31e3b207afc5ff35124127b: model: nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton model_type: onnx precision: fp16 tp: '1' modelFormat: onnx spec: - key: PRECISION value: FP16 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: ONNX - profileId: nim/nvidia/nemoretriever-graphic-elements-v1:a100x1-trt-fp16-7lfmwoem-q framework: TensorRT-LLM displayName: Nemoretriever Graphic Elements V1 A100-SXM4-40GBx1 FP16 ngcMetadata: f93ae043aafc696a85fc58461c074397d39ec747651ca996ae470222f93b4e62: model: nvidia/nemoretriever-graphic-elements-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A100-SXM4-40GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm 
spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-40GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT labels: - signed images - NVIDIA AI Enterprise Supported - NVIDIA NIM - NSPECT-7OBP-T77C config: architectures: - Other modelType: NGC license: NVIDIA AI Foundation Models Community License - name: Nemoretriever Page Elements V2 displayName: Nemoretriever Page Elements V2 modelHubID: nemoretriever-page-elements-v2 category: Retrieval type: NGC description: Object detection model for identifying tables, charts, infographics, and titles within document pages. Improved accuracy and broader element coverage over its predecessor. requireLicense: true licenseAgreements: - label: Use Policy url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/ - label: License Agreement url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/ modelVariants: - variantId: Nemoretriever Page Elements V2 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/nemoretriever-page-elements-v2 optimizationProfiles: - profileId: nim/nvidia/nemoretriever-page-elements-v2:a100x1-trt-fp16-vwtgi2gdbg framework: TensorRT-LLM displayName: Nemoretriever Page Elements V2 A100-SXM4-80GBx1 FP16 ngcMetadata: 26c97e9919dae5405145446f00e0189f615ce682526ec9b6da88b5138ff8097d: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A100-SXM4-80GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-page-elements-v2:h100x1-trt-fp16-gu9grcql-w framework: TensorRT-LLM displayName: 
Nemoretriever Page Elements V2 H100-NVLx1 FP16 ngcMetadata: 2e7da4417b1c44978e00f67823ae7ec885edb310e34c2c18464fbd6cad345065: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: H100-NVL model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-NVL - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-page-elements-v2:h100x1-trt-fp16-gu9grcql-w framework: TensorRT-LLM displayName: Nemoretriever Page Elements V2 H100-HBM3-80GBx1 FP16 ngcMetadata: 6097695b532c9abe549de9918de6b4702eda625f27b508acd7b7dcc04f38ebe1: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: H100-HBM3-80GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-HBM3-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-page-elements-v2:a10gx1-trt-fp16-xrpnjisxaw framework: TensorRT-LLM displayName: Nemoretriever Page Elements V2 A10Gx1 FP16 ngcMetadata: 859ace730c899fb7b8362fe773639da57544f87584f9ed138089e85665653972: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A10G model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A10G - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-page-elements-v2:l40sx1-trt-fp16-ckx59ghtpa framework: TensorRT-LLM displayName: Nemoretriever Page Elements V2 L40Sx1 FP16 ngcMetadata: 
90699b066c264c9533628aeb4f1814ef51e0f2f021540e3ae77181f2ef9ce9ed: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: L40S model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L40S - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-page-elements-v2:b200x1-trt-fp16-b0owuhri1a framework: TensorRT-LLM displayName: Nemoretriever Page Elements V2 B200x1 FP16 ngcMetadata: a7d7cc7f7236b793a7722a3f0777b0cf3a989cc3c2c34d3e66a392329e1530e7: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: B200 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: B200 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-page-elements-v2:l4x1-trt-fp16-z-e8x-oorg framework: TensorRT-LLM displayName: Nemoretriever Page Elements V2 L4x1 FP16 ngcMetadata: ed0b8106aedfc536be363b6f1f0901bd4cd371ef22e640ac03fa7f4e3ed71647: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: L4 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L4 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-page-elements-v2:a100x1-onnx-fp16-wagmq6-x1q framework: ONNX displayName: Nemoretriever Page Elements V2 ONNX FP16 ngcMetadata: edc693c6fccd68d266622eace04225421e353d7ce31e3b207afc5ff35124127b: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton 
model_type: onnx precision: fp16 tp: '1' modelFormat: onnx spec: - key: PRECISION value: FP16 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: ONNX - profileId: nim/nvidia/nemoretriever-page-elements-v2:a100x1-trt-fp16-vwtgi2gdbg framework: TensorRT-LLM displayName: Nemoretriever Page Elements V2 A100-SXM4-40GBx1 FP16 ngcMetadata: f93ae043aafc696a85fc58461c074397d39ec747651ca996ae470222f93b4e62: model: nvidia/nemoretriever-page-elements-v2 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A100-SXM4-40GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-40GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT labels: - signed images - NSPECT-7OBP-T77C - NVIDIA AI Enterprise Supported - NVIDIA NIM config: architectures: - Other modelType: NIM license: NVIDIA AI Foundation Models Community License - name: Nemoretriever Table Structure V1 displayName: Nemoretriever Table Structure V1 modelHubID: nemoretriever-table-structure-v1 category: Retrieval type: NGC description: Object detection model that identifies the internal structure of complex tables, delineating individual cells — including merged cells — rows, and columns. 
requireLicense: true licenseAgreements: - label: Use Policy url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/ - label: License Agreement url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/ modelVariants: - variantId: Nemoretriever Table Structure V1 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/nemoretriever-table-structure-v1 optimizationProfiles: - profileId: nim/nvidia/nemoretriever-table-structure-v1:a100x1-trt-fp16-sdugvfnmmg framework: TensorRT-LLM displayName: Nemoretriever Table Structure V1 A100-SXM4-80GBx1 FP16 ngcMetadata: 26c97e9919dae5405145446f00e0189f615ce682526ec9b6da88b5138ff8097d: model: nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A100-SXM4-80GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-table-structure-v1:h100x1-trt-fp16-ujymwllaaq framework: TensorRT-LLM displayName: Nemoretriever Table Structure V1 H100-NVLx1 FP16 ngcMetadata: 2e7da4417b1c44978e00f67823ae7ec885edb310e34c2c18464fbd6cad345065: model: nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: H100-NVL model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-NVL - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-table-structure-v1:h100x1-trt-fp16-ujymwllaaq framework: TensorRT-LLM displayName: Nemoretriever Table Structure V1 H100-HBM3-80GBx1 FP16 ngcMetadata: 
6097695b532c9abe549de9918de6b4702eda625f27b508acd7b7dcc04f38ebe1: model: nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: H100-HBM3-80GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-HBM3-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-table-structure-v1:a10gx1-trt-fp16-rxf-vkobqa framework: TensorRT-LLM displayName: Nemoretriever Table Structure V1 A10Gx1 FP16 ngcMetadata: 859ace730c899fb7b8362fe773639da57544f87584f9ed138089e85665653972: model: nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A10G model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A10G - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-table-structure-v1:l40sx1-trt-fp16-s31zox26qg framework: TensorRT-LLM displayName: Nemoretriever Table Structure V1 L40Sx1 FP16 ngcMetadata: 90699b066c264c9533628aeb4f1814ef51e0f2f021540e3ae77181f2ef9ce9ed: model: nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: L40S model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L40S - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-table-structure-v1:b200x1-trt-fp16-gqzy1y0hua framework: TensorRT-LLM displayName: Nemoretriever Table Structure V1 B200x1 FP16 ngcMetadata: a7d7cc7f7236b793a7722a3f0777b0cf3a989cc3c2c34d3e66a392329e1530e7: model: 
nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: B200 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: B200 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-table-structure-v1:l4x1-trt-fp16-5m0higxepq framework: TensorRT-LLM displayName: Nemoretriever Table Structure V1 L4x1 FP16 ngcMetadata: ed0b8106aedfc536be363b6f1f0901bd4cd371ef22e640ac03fa7f4e3ed71647: model: nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: L4 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L4 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/nemoretriever-table-structure-v1:a100x1-onnx-fp16-l8hnwsbr3g framework: ONNX displayName: Nemoretriever Table Structure V1 ONNX FP16 ngcMetadata: edc693c6fccd68d266622eace04225421e353d7ce31e3b207afc5ff35124127b: model: nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton model_type: onnx precision: fp16 tp: '1' modelFormat: onnx spec: - key: PRECISION value: FP16 - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: ONNX - profileId: nim/nvidia/nemoretriever-table-structure-v1:a100x1-trt-fp16-sdugvfnmmg framework: TensorRT-LLM displayName: Nemoretriever Table Structure V1 A100-SXM4-40GBx1 FP16 ngcMetadata: f93ae043aafc696a85fc58461c074397d39ec747651ca996ae470222f93b4e62: model: nvidia/nemoretriever-table-structure-v1 release: 1.3.0 tags: backend: triton batch_size: '32' gpu: A100-SXM4-40GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm 
spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-40GB - key: COUNT value: 1 - key: NIM VERSION value: 1.3.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT labels: - signed images - NSPECT-7OBP-T77C - NVIDIA AI Enterprise Supported - NVIDIA NIM config: architectures: - Other modelType: NIM license: NVIDIA AI Foundation Models Community License - name: PaddleOCR displayName: PaddleOCR modelHubID: paddleocr category: Retrieval type: NGC description: Ultra-lightweight OCR system developed by Baidu with broad support for diverse OCR algorithms. Covers the full pipeline from text detection through recognition across document types. requireLicense: true licenseAgreements: - label: Use Policy url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/ - label: License Agreement url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/ modelVariants: - variantId: PaddleOCR source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/baidu/containers/paddleocr optimizationProfiles: - profileId: nim/baidu/paddleocr:2_TRT_python_2 framework: TensorRT-LLM displayName: Paddleocr A100-SXM4-80GBx1 FP16 ngcMetadata: 26c97e9919dae5405145446f00e0189f615ce682526ec9b6da88b5138ff8097d: model: baidu/paddleocr release: 1.4.0 tags: backend: triton batch_size: '32' gpu: A100-SXM4-80GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/baidu/paddleocr:2_TRT_python_2 framework: TensorRT-LLM displayName: Paddleocr H100-NVLx1 FP16 ngcMetadata: 2e7da4417b1c44978e00f67823ae7ec885edb310e34c2c18464fbd6cad345065: model: baidu/paddleocr release: 1.4.0 tags: backend: triton batch_size: '32' gpu: H100-NVL model_type: 
tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-NVL - key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/baidu/paddleocr:2_TRT_python_2 framework: TensorRT-LLM displayName: Paddleocr H100-HBM3-80GBx1 FP16 ngcMetadata: 6097695b532c9abe549de9918de6b4702eda625f27b508acd7b7dcc04f38ebe1: model: baidu/paddleocr release: 1.4.0 tags: backend: triton batch_size: '32' gpu: H100-HBM3-80GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-HBM3-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/baidu/paddleocr:a10gx1-trt-fp16-ijpjeptpna framework: TensorRT-LLM displayName: Paddleocr A10Gx1 FP16 ngcMetadata: 859ace730c899fb7b8362fe773639da57544f87584f9ed138089e85665653972: model: baidu/paddleocr release: 1.4.0 tags: backend: triton batch_size: '32' gpu: A10G model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A10G - key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/baidu/paddleocr:l40sx1-trt-fp16-evboykuf0g framework: TensorRT-LLM displayName: Paddleocr L40Sx1 FP16 ngcMetadata: 90699b066c264c9533628aeb4f1814ef51e0f2f021540e3ae77181f2ef9ce9ed: model: baidu/paddleocr release: 1.4.0 tags: backend: triton batch_size: '32' gpu: L40S model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L40S - key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: 
nim/baidu/paddleocr:b200x1-trt-fp16-itomrwaucq framework: TensorRT-LLM displayName: Paddleocr B200x1 FP16 ngcMetadata: a7d7cc7f7236b793a7722a3f0777b0cf3a989cc3c2c34d3e66a392329e1530e7: model: baidu/paddleocr release: 1.4.0 tags: backend: triton batch_size: '32' gpu: B200 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: B200 - key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/baidu/paddleocr:l4x1-trt-fp16-fswetrkejq framework: TensorRT-LLM displayName: Paddleocr L4x1 FP16 ngcMetadata: ed0b8106aedfc536be363b6f1f0901bd4cd371ef22e640ac03fa7f4e3ed71647: model: baidu/paddleocr release: 1.4.0 tags: backend: triton batch_size: '32' gpu: L4 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L4 - key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT - profileId: nim/baidu/paddleocr:4_ONNX_python_2 framework: ONNX displayName: Paddleocr ONNX FP16 ngcMetadata: edc693c6fccd68d266622eace04225421e353d7ce31e3b207afc5ff35124127b: model: baidu/paddleocr release: 1.4.0 tags: backend: triton model_type: onnx precision: fp16 tp: '1' modelFormat: onnx spec: - key: PRECISION value: FP16 - key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: ONNX - profileId: nim/baidu/paddleocr:2_TRT_python_2 framework: TensorRT-LLM displayName: Paddleocr A100-SXM4-40GBx1 FP16 ngcMetadata: f93ae043aafc696a85fc58461c074397d39ec747651ca996ae470222f93b4e62: model: baidu/paddleocr release: 1.4.0 tags: backend: triton batch_size: '32' gpu: A100-SXM4-40GB model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-40GB - 
key: COUNT value: 1 - key: NIM VERSION value: 1.4.0 - key: DOWNLOAD SIZE value: 1GB - key: BACKEND value: TRITON - key: MODEL TYPE value: TENSORRT labels: - signed images - NSPECT-LDAL-INWI - NVIDIA AI Enterprise Supported - NVIDIA NIM config: architectures: - Other modelType: NIM license: NVIDIA AI Foundation Models Community License - name: Llama 3.2 NV EmbedQA 1b V2 displayName: Llama 3.2 NV EmbedQA 1b V2 modelHubID: nvidia/llama-3.2-nv-embedqa-1b-v2 category: Retrieval type: NGC description: Embedding model for multilingual and cross-lingual question-answering retrieval across 26 languages. Supports dynamic embedding sizing to significantly reduce storage footprint. requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3/license/ modelVariants: - variantId: Llama 3.2 NV EmbedQA 1b V2 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/llama-3.2-nv-embedqa-1b-v2 optimizationProfiles: - profileId: nim/nvidia/llama-3.2-nv-embedqa-1b-v2:onnx-precision.fp16-7c7a1c17 framework: ONNX displayName: Llama 3.2 NV Embedqa 1B V2 ONNX FP16 ngcMetadata: f7391ddbcb95b2406853526b8e489fedf20083a2420563ca3e65358ff417b10f: model: nvidia/llama-3.2-nv-embedqa-1b-v2 release: 1.10.0 tags: backend: onnx model_type: onnx precision: fp16 tp: '1' modelFormat: onnx spec: - key: PRECISION value: FP16 - key: COUNT value: 1 - key: NIM VERSION value: 1.10.0 - key: DOWNLOAD SIZE value: 3GB - key: BACKEND value: ONNX - key: MODEL TYPE value: ONNX - key: MAX TOKENS value: 8192 - key: TOTAL PARAMETERS value: 1236 - key: Embedding Dimension value: 2048 labels: - Llama - Meta - Chat - Large Language Model - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Llama 3.2 NV RerankQA 1b V2 displayName: Llama 3.2 NV RerankQA 1b V2 modelHubID: nvidia/llama-3.2-nv-rerankqa-1b-v2 category: 
Retrieval type: NGC description: Reranking model that scores document relevance to improve retrieval accuracy. Fine-tuned for multilingual and cross-lingual use across 26 languages. requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3/license/ modelVariants: - variantId: Llama 3.2 NV RerankQA 1b V2 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/llama-3.2-nv-rerankqa-1b-v2 optimizationProfiles: - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:h100x1-trt-fp16--ckqlv3j2g framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA H100 NVLx1 FP16 ngcMetadata: 3b1e767e41d02ed0ffa5aa6b46a2edfdd1540edaec2eeda4c00278c838bba38b: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 2321:10de gpu: NVIDIA H100 NVL gpu_key: h100-nvl model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-NVL - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 5GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:a100x1-trt-fp16-dxtbz8wstg framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA A100-SXM4-40GBx1 FP16 ngcMetadata: 477500a740ea33ea1419289866bbfd598ce51a806fe034b48dc176db32155f59: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 20b0:10de gpu: NVIDIA A100-SXM4-40GB gpu_key: a100-sxm4-40gb model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-40GB - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 3GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:l40sx1-trt-fp16-20qsn53gag framework: 
TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA L40Sx1 FP16 ngcMetadata: 49d14b4eaebc6b1f61e48afb3d88535f4ad3758ea55036f5ab3815d1c5a927fc: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 26b9:10de gpu: NVIDIA L40S gpu_key: l40s model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L40S - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 3GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:a100x1-trt-fp16-dxtbz8wstg framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA A100-SXM4-80GBx1 FP16 ngcMetadata: 4ea4624dcc114adeeb29272322897800cddf5dfa873dac467f67d827b7dd9c4d: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 20b2:10de gpu: NVIDIA A100-SXM4-80GB gpu_key: a100-sxm4-80gb model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A100-SXM4-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 3GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:l40sx1-trt-fp8-4nwnajwq4g framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA L40Sx1 FP8 ngcMetadata: 5036ebf412fba4e54511ab4b3822ec7dfb9fd2c256c3100ad2ed9d2b4bda9f79: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 26b9:10de gpu: NVIDIA L40S gpu_key: l40s model_type: tensorrt precision: fp8 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP8 - key: GPU value: L40S - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 2GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:a10gx1-trt-fp16-fxo3knzn8w framework: TensorRT-LLM 
displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA A10Gx1 FP16 ngcMetadata: 6f21ae4169cfe3c03cc92eb194713f5a3044ac2f61526edf632d0f9a5155b538: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 2237:10de gpu: NVIDIA A10G gpu_key: a10g model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: A10G - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 3GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:b200x1-trt-fp16-jiw0-uharg framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA B200x1 FP16 ngcMetadata: 75b659320dada86548fb6af5d3adfe386df6c515969d71db4e76cd64375777e1: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 2901:10de gpu: NVIDIA B200 gpu_key: b200 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: B200 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 4GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:h100x1-trt-fp8-bm87q6egvq framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA H100 80GB HBM3x1 FP8 ngcMetadata: 774e4d699d318f41630b51b4280cadecb184b9b2755b707aa74232f1ea642b2c: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 2330:10de gpu: NVIDIA H100 80GB HBM3 gpu_key: h100-hbm3-80gb model_type: tensorrt precision: fp8 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP8 - key: GPU value: H100-HBM3-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 2GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:l4x1-trt-fp16-bajefiwkra framework: TensorRT-LLM displayName: 
Llama 3.2 NV Rerankqa 1B V2 NVIDIA L4x1 FP16 ngcMetadata: 9278eac727396c9f6ab9b3d421748889b0686afd20a9cef12d1d16c39fcd6a9d: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 27b8:10de gpu: NVIDIA L4 gpu_key: l4 model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: L4 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 3GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:l4x1-trt-fp8-vk0qdpls2w framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA L4x1 FP8 ngcMetadata: a344745c8dbe62413a4e95b4e5718a689c155dfb8743868fb5d13956a621b31e: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 27b8:10de gpu: NVIDIA L4 gpu_key: l4 model_type: tensorrt precision: fp8 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP8 - key: GPU value: L4 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 2GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:h100x1-trt-fp8-bm87q6egvq framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA H100 NVLx1 FP8 ngcMetadata: b469c56c1a9ac1001151765527d3c7de77f590255b08eea4aa064ee1abf0ef3f: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 2321:10de gpu: NVIDIA H100 NVL gpu_key: h100-nvl model_type: tensorrt precision: fp8 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP8 - key: GPU value: H100-NVL - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 2GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:h100x1-trt-fp16--ckqlv3j2g framework: TensorRT-LLM displayName: Llama 3.2 NV Rerankqa 1B V2 NVIDIA H100 80GB HBM3x1 FP16 
ngcMetadata: ddd9c5d1430631c0bd75c04b0c18e9b620219ad82c808a30d019be9cbcd618bd: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: tensorrt device_id: 2330:10de gpu: NVIDIA H100 80GB HBM3 gpu_key: h100-hbm3-80gb model_type: tensorrt precision: fp16 tp: '1' modelFormat: trt-llm spec: - key: PRECISION value: FP16 - key: GPU value: H100-HBM3-80GB - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 5GB - key: BACKEND value: TENSORRT - key: MODEL TYPE value: TENSORRT - profileId: nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:onnx-precision.fp16-d03bf375 framework: ONNX displayName: Llama 3.2 NV Rerankqa 1B V2 ONNX FP16 ngcMetadata: f7391ddbcb95b2406853526b8e489fedf20083a2420563ca3e65358ff417b10f: model: nvidia/llama-3.2-nv-rerankqa-1b-v2 release: 1.8.0 tags: backend: onnx model_type: onnx precision: fp16 tp: '1' modelFormat: onnx spec: - key: PRECISION value: FP16 - key: COUNT value: 1 - key: NIM VERSION value: 1.8.0 - key: DOWNLOAD SIZE value: 3GB - key: BACKEND value: ONNX - key: MODEL TYPE value: ONNX labels: - Llama - Meta - Chat - NIM - Large Language Model - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Llama 3.2 Vision Instruct displayName: Llama 3.2 Vision Instruct modelHubID: llama-3.2-vision-instruct category: Vision & Multimodal type: NGC description: The Llama 3.2 Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. 
requireLicense: true licenseAgreements: - label: Use Policy url: https://llama.meta.com/llama3/use-policy/ - label: License Agreement url: https://llama.meta.com/llama3/license/ modelVariants: - variantId: Llama 3.2 11B Vision Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama-3.2-11b-vision-instruct optimizationProfiles: - profileId: nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x2-bf16-latency.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct H100 BF16 Latency sha: 126d5a664ba4b6b4557d5e0225b51a5e2ffbf9e9909bfe25ed203bec421ea2e5 ngcMetadata: 126d5a664ba4b6b4557d5e0225b51a5e2ffbf9e9909bfe25ed203bec421ea2e5: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: latency tp: '2' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x2-bf16-latency.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: 
ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 20GB - profileId: nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-a10gx4-bf16-throughput.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct A10G BF16 Throughput sha: 417611b3f9e2c25db671083acfcfd4c2340f511f3533838fc6366bb47960915c ngcMetadata: 417611b3f9e2c25db671083acfcfd4c2340f511f3533838fc6366bb47960915c: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '4' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'trtllm_engine/rank2.engine' - !name 'trtllm_engine/rank3.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-a10gx4-bf16-throughput.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: 
ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 4 - key: GPU DEVICE value: 2237:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 20GB - profileId: nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-a10gx8-bf16-latency.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct A10G BF16 Latency sha: 5a9f2d4459908cf6c5b5222e31b8df053c00354b5866f6ee3b8de7552a695524 ngcMetadata: 5a9f2d4459908cf6c5b5222e31b8df053c00354b5866f6ee3b8de7552a695524: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: A10G gpu_device: 2237:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: latency tp: '8' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'trtllm_engine/rank2.engine' - !name 'trtllm_engine/rank3.engine' - !name 'trtllm_engine/rank4.engine' - !name 'trtllm_engine/rank5.engine' - !name 'trtllm_engine/rank6.engine' - !name 'trtllm_engine/rank7.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-a10gx8-bf16-latency.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: 
ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: BF16 - key: GPU value: A10G - key: COUNT value: 8 - key: GPU DEVICE value: 2237:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 20GB - profileId: nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x2-fp8-latency.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct H100 FP8 Latency sha: ab89f816413848c86e311123d2ed98af7bcda0c3624b0a6c4d43704b720585d5 ngcMetadata: ab89f816413848c86e311123d2ed98af7bcda0c3624b0a6c4d43704b720585d5: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: latency tp: '2' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x2-fp8-latency.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' 
repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 12GB - profileId: nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-a100x2-bf16-latency.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct A100 BF16 Latency sha: ad16e693a234cf8eee85c43dd66ab4502c51ab0bc553af1644477a4f966bf5c6 ngcMetadata: ad16e693a234cf8eee85c43dd66ab4502c51ab0bc553af1644477a4f966bf5c6: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: latency tp: '2' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-a100x2-bf16-latency.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE 
value: Latency - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 2 - key: GPU DEVICE value: 20b2:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 20GB - profileId: nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-l40sx2-bf16-throughput.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct L40S BF16 Throughput sha: b16d5969212a8cea632fd6f70928ab514aab835cf217281899564933e6cafa5b ngcMetadata: b16d5969212a8cea632fd6f70928ab514aab835cf217281899564933e6cafa5b: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '2' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-l40sx2-bf16-throughput.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 2 - key: GPU 
DEVICE value: 26b5:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 20GB - profileId: nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x1-bf16-throughput.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct H100 BF16 Throughput sha: b7aa6bf9d9946de665480a5669bb73f981eab7c4fe43ddf7217b672eb11a003a ngcMetadata: b7aa6bf9d9946de665480a5669bb73f981eab7c4fe43ddf7217b672eb11a003a: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '1' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x1-bf16-throughput.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 20GB - profileId: 
nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-l40sx4-bf16-latency.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct L40S BF16 Latency sha: be5af3c968ce6bc45e740edc985fa05dffd3695abb7cc5723407e1f5e3f12c70 ngcMetadata: be5af3c968ce6bc45e740edc985fa05dffd3695abb7cc5723407e1f5e3f12c70: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: latency tp: '4' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'trtllm_engine/rank2.engine' - !name 'trtllm_engine/rank3.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-l40sx4-bf16-latency.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 4 - key: GPU DEVICE value: 26b5:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 20GB - profileId: 
nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-a100x1-bf16-throughput.0.3.20143152 framework: TensorRT-LLM displayName: Llama 3.2 11B Vision Instruct A100 BF16 Throughput sha: ee1e936b878082dee74574deae5064cc7fba3e11ba155de1198ee544d7c3468a ngcMetadata: ee1e936b878082dee74574deae5064cc7fba3e11ba155de1198ee544d7c3468a: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: A100 gpu_device: 20b2:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '1' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-a100x1-bf16-throughput.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: A100 - key: COUNT value: 1 - key: GPU DEVICE value: 20b2:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 20GB - profileId: nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x1-fp8-throughput.0.3.20143152 framework: TensorRT-LLM 
displayName: Llama 3.2 11B Vision Instruct H100 FP8 Throughput sha: fa1e1cbf698be85c0cc56d707f8bc5b17044e091136dae3f8e4be694af727c87 ngcMetadata: fa1e1cbf698be85c0cc56d707f8bc5b17044e091136dae3f8e4be694af727c87: model: meta/llama-3.2-11b-vision-instruct release: 1.1.1 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: throughput tp: '1' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x1-fp8-throughput.0.3.20143152 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78 - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:hf-cee5b78-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-11b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 12GB - variantId: Llama 3.2 90B Vision Instruct source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama-3.2-90b-vision-instruct optimizationProfiles: - profileId: 
nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x4-bf16-throughput.0.3.20194742 framework: TensorRT-LLM displayName: Llama 3.2 90B Vision Instruct H100 BF16 Throughput sha: 42c91902414bc5ea7f4ef4e9a34ef382165b8b65f9adcc5d1abaf195ade2d8fc ngcMetadata: 42c91902414bc5ea7f4ef4e9a34ef382165b8b65f9adcc5d1abaf195ade2d8fc: model: meta/llama-3.2-90b-vision-instruct release: 1.1.1 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '4' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'trtllm_engine/rank2.engine' - !name 'trtllm_engine/rank3.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x4-bf16-throughput.0.3.20194742 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 166GB - profileId: 
nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x2-fp8-throughput.0.3.20194742 framework: TensorRT-LLM displayName: Llama 3.2 90B Vision Instruct H100 FP8 Throughput sha: 6b24bf2e19c23b85f9d2651efdc2de08cd179a03c50f942b1dcd856fa4d4074b ngcMetadata: 6b24bf2e19c23b85f9d2651efdc2de08cd179a03c50f942b1dcd856fa4d4074b: model: meta/llama-3.2-90b-vision-instruct release: 1.1.1 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: throughput tp: '2' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x2-fp8-throughput.0.3.20194742 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 2 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 85GB - profileId: nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-l40sx8-bf16-throughput.0.3.1342 
framework: TensorRT-LLM displayName: Llama 3.2 90B Vision Instruct L40S BF16 Throughput sha: 7bb72cbd19b5eab69ed21b2e031e4ea18909ff034255471c25b29ab45a99cc8b ngcMetadata: 7bb72cbd19b5eab69ed21b2e031e4ea18909ff034255471c25b29ab45a99cc8b: model: meta/llama-3.2-90b-vision-instruct release: 1.1.1 tags: gpu: L40S gpu_device: 26b5:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: throughput tp: '8' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'trtllm_engine/rank2.engine' - !name 'trtllm_engine/rank3.engine' - !name 'trtllm_engine/rank4.engine' - !name 'trtllm_engine/rank5.engine' - !name 'trtllm_engine/rank6.engine' - !name 'trtllm_engine/rank7.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-l40sx8-bf16-throughput.0.3.1342 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Throughput - key: PRECISION value: BF16 - key: GPU value: L40S - key: COUNT value: 8 - key: GPU DEVICE value: 26b5:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 166GB - 
profileId: nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x4-fp8-latency.0.3.20194742 framework: TensorRT-LLM displayName: Llama 3.2 90B Vision Instruct H100 FP8 Latency sha: a6e9fde5c1edfb4ab4c0b206a536693a6f9b1f95cde1448ddd679fb880fcef71 ngcMetadata: a6e9fde5c1edfb4ab4c0b206a536693a6f9b1f95cde1448ddd679fb880fcef71: model: meta/llama-3.2-90b-vision-instruct release: 1.1.1 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: fp8 profile: latency tp: '4' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'trtllm_engine/rank2.engine' - !name 'trtllm_engine/rank3.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x4-fp8-latency.0.3.20194742 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: FP8 - key: GPU value: H100 - key: COUNT value: 4 - key: GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 87GB - profileId: 
nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x8-bf16-latency.0.3.20194742 framework: TensorRT-LLM displayName: Llama 3.2 90B Vision Instruct H100 BF16 Latency sha: e994500d8b0e10f63a08e6a90143a60c360d004f6d5ea8bdb4d38d215eb3fa83 ngcMetadata: e994500d8b0e10f63a08e6a90143a60c360d004f6d5ea8bdb4d38d215eb3fa83: model: meta/llama-3.2-90b-vision-instruct release: 1.1.1 tags: gpu: H100 gpu_device: 2330:10de llm_engine: tensorrt_llm pp: '1' precision: bf16 profile: latency tp: '8' workspace: !workspace components: - dst: '' src: files: - !name 'LICENSE.txt' - !name 'NOTICE.txt' - !name 'checksums.blake3' - !name 'trtllm_engine/config.json' - !name 'trtllm_engine/metadata.json' - !name 'trtllm_engine/rank0.engine' - !name 'trtllm_engine/rank1.engine' - !name 'trtllm_engine/rank2.engine' - !name 'trtllm_engine/rank3.engine' - !name 'trtllm_engine/rank4.engine' - !name 'trtllm_engine/rank5.engine' - !name 'trtllm_engine/rank6.engine' - !name 'trtllm_engine/rank7.engine' - !name 'visual_engine/config.json' - !name 'visual_engine/metadata.json' - !name 'visual_engine/visual_encoder.engine' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:0.15.0.dev2024102300+ea8391c56-h100x8-bf16-latency.0.3.20194742 - dst: '' src: files: - !name 'config.json' - !name 'generation_config.json' - !name 'preprocessor_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b - dst: '' src: files: - !name 'special_tokens_map.json' - !name 'tokenizer.json' - !name 'tokenizer_config.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:hf-0a6d69b-tok - dst: visual_engine src: files: - !name 'vision_processor.py' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:vision-processor - dst: '' src: files: - !name 'runtime_params.json' repo_id: ngc://nim/meta/llama-3.2-90b-vision-instruct:runtime-params-trtllm modelFormat: trt-llm spec: - key: PROFILE value: Latency - key: PRECISION value: BF16 - key: GPU value: H100 - key: COUNT value: 8 - key: 
GPU DEVICE value: 2330:10de - key: NIM VERSION value: 1.1.1 - key: DOWNLOAD SIZE value: 166GB labels: - Llama - Meta - Chat - Large Language Model - TensorRT-LLM - Vision Instruct - Image to Text Generation - Language Generation - NeMo - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Riva ASR Whisper Large v3 displayName: Riva ASR Whisper Large v3 modelHubID: nvidia/riva-asr/whisper category: Speech type: NGC description: Automatic speech recognition and speech translation model trained on 680K hours of labeled audio. Generalizes well across diverse datasets and domains without fine-tuning. requireLicense: true licenseAgreements: - label: Use Policy url: https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/ - label: License Agreement url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/ modelVariants: - variantId: Riva ASR Whisper Large v3 source: URL: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/riva/models/whisper_large optimizationProfiles: - profileId: nim/nvidia/whisper-large-v3:ofl-rmir-25.06 framework: TensorRT-LLM displayName: Riva ASR Whisper Large v3 Generic NVIDIA GPUx1 ngcMetadata: 5e44fa6d8cd80ad46a089089157ff4565974f0a64fd37c594265c61f00418ae0: model: nvidia/riva-asr/whisper release: 1.3.1 tags: mode: ofl model_type: rmir name: whisper-large-v3 tp: '1' modelFormat: trt-llm spec: - key: COUNT value: 1 - key: NIM VERSION value: 1.3.1 - key: DOWNLOAD SIZE value: 3GB - key: MODEL TYPE value: RMIR - key: MODE value: OFL - profileId: nim/nvidia/whisper-large-v3:h100x1-ofl-25.08-fp16-mnz4pnn0pw framework: TensorRT-LLM displayName: Riva ASR Whisper Large v3 H100 FP16 ngcMetadata: 72232937075119887298deb92b5e58f4d98a0ce0948df60d424f0d97b05da55e: model: nvidia/riva-asr/whisper release: 1.3.1 tags: gpu_device: '2330' mode: ofl model_type: prebuilt name: whisper-large-v3 gpu: H100 tp: '1' 
modelFormat: trt-llm spec: - key: GPU value: H100 - key: COUNT value: 1 - key: GPU DEVICE value: 2330 - key: NIM VERSION value: 1.3.1 - key: DOWNLOAD SIZE value: 2GB - key: MODEL TYPE value: PREBUILT - key: MODE value: OFL labels: - Transformer - TensorRT-LLM - Audio - NVIDIA Validated config: architectures: - Other modelType: llama license: NVIDIA AI Foundation Models Community License - name: Boltz2 displayName: Boltz2 modelHubID: boltz2 category: Science type: NGC description: Structural biology foundation model for protein structure prediction and binding affinity estimation. Approaches the accuracy of gold-standard computational methods at a fraction of the cost. requireLicense: true licenseAgreements: - label: Use Policy url: https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/ - label: License Agreement url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/ modelVariants: - variantId: Boltz2 source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/mit/containers/boltz2 optimizationProfiles: - profileId: nim/mit/boltz2:1.1.0-combi-gpuA100_sm80_v11 framework: TensorRT-LLM displayName: Boltz2 A100x1 SM80 V11 ngcMetadata: 9c411ee75a91c41ae5e85ae81e91ab9fcdde58477da17785d188e0de41a54a8e: model: mit/boltz2 release: 1.1.0 tags: gpu: A100 sm: '80' v: '11' tp: '1' modelFormat: trt-llm spec: - key: GPU value: A100 - key: COUNT value: 1 - key: NIM VERSION value: 1.1.0 - key: DOWNLOAD SIZE value: 13GB - key: SM value: '80' - key: V value: '11' - profileId: nim/mit/boltz2:1.1.0-combi-gpuB200_sm100_v11 framework: TensorRT-LLM displayName: Boltz2 B200x1 SM100 V11 ngcMetadata: c9f95e6e506df04cf37ff5b7a70f2cde0eed8b61f43a9dc5c857d4d6fdbe4c78: model: mit/boltz2 release: 1.1.0 tags: gpu: B200 sm: '100' v: '11' tp: '1' modelFormat: trt-llm spec: - key: GPU value: B200 - key: COUNT value: 1 - key: NIM VERSION value: 1.1.0 - key: DOWNLOAD SIZE value: 12GB - key: SM value: '100' - key: V 
value: '11' - profileId: nim/mit/boltz2:1.1.0-combi-gpuL40S_sm89_v11 framework: TensorRT-LLM displayName: Boltz2 L40Sx1 SM89 V11 ngcMetadata: f429a14a6470d83cd68dd11dc45cbd69c235636d5f82f147cad04b927e86be56: model: mit/boltz2 release: 1.1.0 tags: gpu: L40S sm: '89' v: '11' tp: '1' modelFormat: trt-llm spec: - key: GPU value: L40S - key: COUNT value: 1 - key: NIM VERSION value: 1.1.0 - key: DOWNLOAD SIZE value: 14GB - key: SM value: '89' - key: V value: '11' - profileId: nim/mit/boltz2:1.1.0-combi-gpuH100_sm90_v11 framework: TensorRT-LLM displayName: Boltz2 H100x1 SM90 V11 ngcMetadata: f6884d58e3b6cf070d18085a79760da0dd36669aaa52900e8f5b50eebd8f304c: model: mit/boltz2 release: 1.1.0 tags: gpu: H100 sm: '90' v: '11' tp: '1' modelFormat: trt-llm spec: - key: GPU value: H100 - key: COUNT value: 1 - key: NIM VERSION value: 1.1.0 - key: DOWNLOAD SIZE value: 13GB - key: SM value: '90' - key: V value: '11' labels: - Biology Foundation Model - signed images - NSPECT-D4IX-8I2O - NVIDIA AI Enterprise Supported - NVIDIA NIM config: architectures: - Other modelType: NIM license: NVIDIA AI Foundation Models Community License - name: GPT-OSS displayName: GPT-OSS modelHubID: gpt-oss category: Language type: NGC description: OpenAI's open-weight Mixture-of-Experts models for reasoning and agentic tasks. Both 20B and 120B variants support chain-of-thought reasoning, tuned for different latency and throughput tradeoffs. 
requireLicense: true licenseAgreements: - label: Use Policy url: https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/ - label: License Agreement url: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/ modelVariants: - variantId: GPT-OSS 120B source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/openai/containers/gpt-oss-120b optimizationProfiles: - profileId: nim/openai/gpt-oss-120b:v1 framework: VLLM displayName: GPT-OSS 120B Generic NVIDIA GPUx8 MXFP4 ngcMetadata: 46b15913be22333eb518656e23ce6e4add9ef248521868cca0bb31af8c99458c: model: openai/gpt-oss-120b release: 1.12.1 tags: feat_lora: 'false' llm_engine: vllm nim_workspace_hash_v1: 0d2325f68f4fecf18db654a22e7647c17ee6003b0f8c605d29675be643727b60 pp: '1' precision: mxfp4 tp: '8' modelFormat: vllm spec: - key: PRECISION value: MXFP4 - key: COUNT value: 8 - key: NIM VERSION value: 1.12.1 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/openai/gpt-oss-120b:v1 framework: VLLM displayName: GPT-OSS 120B Generic NVIDIA GPUx2 MXFP4 ngcMetadata: 4a0d3557b2676290857e191d9b71eeff964f4fe10a7ddac66c66d2fc9983c399: model: openai/gpt-oss-120b release: 1.12.1 tags: feat_lora: 'false' llm_engine: vllm nim_workspace_hash_v1: 0d2325f68f4fecf18db654a22e7647c17ee6003b0f8c605d29675be643727b60 pp: '1' precision: mxfp4 tp: '2' modelFormat: vllm spec: - key: PRECISION value: MXFP4 - key: COUNT value: 2 - key: NIM VERSION value: 1.12.1 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/openai/gpt-oss-120b:v1 framework: VLLM displayName: GPT-OSS 120B Generic NVIDIA GPUx4 MXFP4 ngcMetadata: 531bca8ac3e457ae8a69a62780430eab361264dbd29b79f2da474084a93ee000: model: openai/gpt-oss-120b release: 1.12.1 tags: feat_lora: 'false' llm_engine: vllm nim_workspace_hash_v1: 0d2325f68f4fecf18db654a22e7647c17ee6003b0f8c605d29675be643727b60 pp: '1' precision: mxfp4 tp: '4' modelFormat: vllm spec: - key: PRECISION value: MXFP4 - key: COUNT value: 4 - key: NIM VERSION value: 1.12.1 - 
key: DOWNLOAD SIZE value: 7GB - profileId: nim/openai/gpt-oss-120b:v1 framework: VLLM displayName: GPT-OSS 120B Generic NVIDIA GPUx1 MXFP4 ngcMetadata: 899cf2e170bb3af67ba7fef534eda3eeb8e3b69c415cd1ec4872752f6c64e6a0: model: openai/gpt-oss-120b release: 1.12.1 tags: feat_lora: 'false' llm_engine: vllm nim_workspace_hash_v1: 0d2325f68f4fecf18db654a22e7647c17ee6003b0f8c605d29675be643727b60 pp: '1' precision: mxfp4 tp: '1' modelFormat: vllm spec: - key: PRECISION value: MXFP4 - key: COUNT value: 1 - key: NIM VERSION value: 1.12.1 - key: DOWNLOAD SIZE value: 7GB - variantId: GPT-OSS 20B source: URL: https://catalog.ngc.nvidia.com/orgs/nim/teams/openai/containers/gpt-oss-20b optimizationProfiles: - profileId: nim/openai/gpt-oss-20b:v1 framework: VLLM displayName: GPT-OSS 20B Generic NVIDIA GPUx8 MXFP4 ngcMetadata: 2e2c8811172659dd321e19a927ec632eaba747f3e0017bacc8ad27109e75878a: model: openai/gpt-oss-20b release: 1.12.1 tags: feat_lora: 'false' llm_engine: vllm nim_workspace_hash_v1: 1a244f5794c2c5706bd813ad4d1b003065f81939741d67ccc0bd11f9c963f5db pp: '1' precision: mxfp4 tp: '8' modelFormat: vllm spec: - key: PRECISION value: MXFP4 - key: COUNT value: 8 - key: NIM VERSION value: 1.12.1 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/openai/gpt-oss-20b:v1 framework: VLLM displayName: GPT-OSS 20B Generic NVIDIA GPUx1 MXFP4 ngcMetadata: 319230871747dcf65aac0a4af04e603297460c2eab06611b7a7e6a41ae885da8: model: openai/gpt-oss-20b release: 1.12.1 tags: feat_lora: 'false' llm_engine: vllm nim_workspace_hash_v1: 1a244f5794c2c5706bd813ad4d1b003065f81939741d67ccc0bd11f9c963f5db pp: '1' precision: mxfp4 tp: '1' modelFormat: vllm spec: - key: PRECISION value: MXFP4 - key: COUNT value: 1 - key: NIM VERSION value: 1.12.1 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/openai/gpt-oss-20b:v1 framework: VLLM displayName: GPT-OSS 20B Generic NVIDIA GPUx4 MXFP4 ngcMetadata: bfa3fc92db1f01459fe0dea1886ad7f89366a04208a24debb1ad13747fe1cb9b: model: openai/gpt-oss-20b release: 1.12.1 tags: 
feat_lora: 'false' llm_engine: vllm nim_workspace_hash_v1: 1a244f5794c2c5706bd813ad4d1b003065f81939741d67ccc0bd11f9c963f5db pp: '1' precision: mxfp4 tp: '4' modelFormat: vllm spec: - key: PRECISION value: MXFP4 - key: COUNT value: 4 - key: NIM VERSION value: 1.12.1 - key: DOWNLOAD SIZE value: 7GB - profileId: nim/openai/gpt-oss-20b:v1 framework: VLLM displayName: GPT-OSS 20B Generic NVIDIA GPUx2 MXFP4 ngcMetadata: efa7e9795d36904b4466f82f6b32c28507ed700b2fef0be0b2e7c1de10a2eeb8: model: openai/gpt-oss-20b release: 1.12.1 tags: feat_lora: 'false' llm_engine: vllm nim_workspace_hash_v1: 1a244f5794c2c5706bd813ad4d1b003065f81939741d67ccc0bd11f9c963f5db pp: '1' precision: mxfp4 tp: '2' modelFormat: vllm spec: - key: PRECISION value: MXFP4 - key: COUNT value: 2 - key: NIM VERSION value: 1.12.1 - key: DOWNLOAD SIZE value: 7GB labels: - OpenAI - signed images - NSPECT-LJGD-9W15 - NVIDIA AI Enterprise Supported - NVIDIA NIM config: architectures: - Other modelType: NIM license: NVIDIA AI Foundation Models Community License - name: Gemma 2 displayName: Gemma 2 modelHubID: gemma-2 category: Language type: HF description: Gemma 2 is the second generation of the Google community Gemma lineage. It delivers higher performance and significant safety improvements, and is well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. 
modelVariants: - variantId: Gemma 2 9B displayName: Gemma 2 9B source: URL: https://huggingface.co/google/gemma-2-9b requireToken: true requireLicense: true licenseAgreements: - label: License Agreement url: https://ai.google.dev/gemma/terms - label: Use Policy url: https://ai.google.dev/gemma/prohibited_use_policy optimizationProfiles: - profileId: google/gemma-2-9b displayName: Gemma 2 9b A10G framework: vllm sha: vllm modelFormat: vllm spec: - key: GPU value: A10G - key: COUNT value: 1 - profileId: google/gemma-2-9b displayName: Gemma 2 9b A100 framework: vllm sha: vllm modelFormat: vllm spec: - key: GPU value: A100 - key: COUNT value: 1 - profileId: google/gemma-2-9b displayName: Gemma 2 9b L40S framework: vllm sha: vllm modelFormat: vllm spec: - key: GPU value: L40S - key: COUNT value: 1 labels: - google - Gemma - "Text Generation" - "Multilingual support" config: architectures: - Gemma2ForCausalLM modelType: Gemma2 license: gemma - name: Llama 3 SQLCoder displayName: Llama 3 SQLCoder modelHubID: llama-3-sqlcoder-8b category: Code type: HF description: Text-to-SQL model supporting Postgres, Redshift, and Snowflake dialects. Frontier-level accuracy for natural language database querying. 
modelVariants: - variantId: Llama 3 SQLCoder 8B displayName: Llama 3 SQLCoder 8B source: URL: https://huggingface.co/defog/llama-3-sqlcoder-8b requireToken: false requireLicense: false licenseAgreements: - label: License Agreement url: https://choosealicense.com/licenses/cc-by-sa-4.0/ optimizationProfiles: - profileId: Defog/Llama-3-sqlcoder-8B displayName: Llama 3 SQLCoder 8B A10G framework: vllm sha: vllm modelFormat: vllm spec: - key: GPU value: A10G - key: COUNT value: 1 - profileId: Defog/Llama-3-sqlcoder-8B displayName: Llama 3 SQLCoder 8B A100 framework: vllm sha: vllm modelFormat: vllm spec: - key: GPU value: A100 - key: COUNT value: 1 - profileId: Defog/Llama-3-sqlcoder-8B displayName: Llama 3 SQLCoder 8B L40S framework: vllm sha: vllm modelFormat: vllm spec: - key: GPU value: L40S - key: COUNT value: 1 labels: - Llama - "Text To SQL" - "Code Generation" - "Fine Tuned" config: architectures: - LlamaForCausalLM modelType: llama license: Creative Commons Attribution Share Alike 4.0