{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://developer.nvidia.com/schemas/triton/model.json",
  "title": "Triton Inference Server Model",
  "description": "A machine learning model managed by NVIDIA Triton Inference Server, including its configuration, input/output tensor definitions, batching strategy, and deployment settings.",
  "type": "object",
  "required": ["name"],
  "properties": {
    "name": {
      "type": "string",
      "description": "Unique name of the model within the model repository",
      "minLength": 1
    },
    "platform": {
      "type": "string",
      "description": "Framework platform of the model",
      "enum": [
        "tensorrt_plan",
        "tensorflow_graphdef",
        "tensorflow_savedmodel",
        "onnxruntime_onnx",
        "pytorch_libtorch",
        "python",
        "ensemble"
      ]
    },
    "backend": {
      "type": "string",
      "description": "Backend used by the model for inference execution",
      "examples": ["tensorrt", "tensorflow", "onnxruntime", "pytorch", "python", "openvino", "fil"]
    },
    "version_policy": {
      "$ref": "#/$defs/VersionPolicy"
    },
    "max_batch_size": {
      "type": "integer",
      "minimum": 0,
      "description": "Maximum batch size supported by the model. A value of 0 means batching is disabled."
    },
    "input": {
      "type": "array",
      "description": "Input tensor definitions for the model",
      "items": {
        "$ref": "#/$defs/TensorConfig"
      }
    },
    "output": {
      "type": "array",
      "description": "Output tensor definitions for the model",
      "items": {
        "$ref": "#/$defs/TensorConfig"
      }
    },
    "instance_group": {
      "type": "array",
      "description": "Instance group configurations specifying how model instances are deployed across devices",
      "items": {
        "$ref": "#/$defs/InstanceGroup"
      }
    },
    "dynamic_batching": {
      "$ref": "#/$defs/DynamicBatching"
    },
    "sequence_batching": {
      "$ref": "#/$defs/SequenceBatching"
    },
    "ensemble_scheduling": {
      "$ref": "#/$defs/EnsembleScheduling"
    },
    "parameters": {
      "type": "object",
      "description": "Custom key-value parameters for the model",
      "additionalProperties": {
        "type": "object",
        "properties": {
          "string_value": {
            "type": "string"
          }
        }
      }
    },
    "model_warmup": {
      "type": "array",
      "description": "Warmup configurations to pre-heat the model after loading",
      "items": {
        "$ref": "#/$defs/ModelWarmup"
      }
    },
    "optimization": {
      "$ref": "#/$defs/Optimization"
    },
    "response_cache": {
      "type": "object",
      "description": "Response cache configuration",
      "properties": {
        "enable": {
          "type": "boolean",
          "description": "Whether response caching is enabled for this model"
        }
      }
    }
  },
  "$defs": {
    "VersionPolicy": {
      "type": "object",
      "description": "Policy for selecting which model versions are available for inference",
      "properties": {
        "latest": {
          "type": "object",
          "description": "Serve the N most recent versions",
          "properties": {
            "num_versions": {
              "type": "integer",
              "minimum": 1,
              "description": "Number of latest versions to serve"
            }
          }
        },
        "all": {
          "type": "object",
          "description": "Serve all available versions"
        },
        "specific": {
          "type": "object",
          "description": "Serve only the specified versions",
          "properties": {
            "versions": {
              "type": "array",
              "items": {
                "type": "integer"
              },
              "description": "List of specific version numbers to serve"
            }
          }
        }
      }
    },
    "TensorConfig": {
      "type": "object",
      "description": "Configuration for a model input or output tensor",
      "required": ["name", "data_type", "dims"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the tensor"
        },
        "data_type": {
          "type": "string",
          "description": "Data type of the tensor elements",
          "enum": [
            "TYPE_BOOL",
            "TYPE_UINT8",
            "TYPE_UINT16",
            "TYPE_UINT32",
            "TYPE_UINT64",
            "TYPE_INT8",
            "TYPE_INT16",
            "TYPE_INT32",
            "TYPE_INT64",
            "TYPE_FP16",
            "TYPE_FP32",
            "TYPE_FP64",
            "TYPE_STRING",
            "TYPE_BF16"
          ]
        },
        "dims": {
          "type": "array",
          "description": "Tensor dimensions. Use -1 for variable-length dimensions.",
          "items": {
            "type": "integer"
          }
        },
        "reshape": {
          "type": "object",
          "description": "Optional reshape configuration for the tensor",
          "properties": {
            "shape": {
              "type": "array",
              "items": {
                "type": "integer"
              }
            }
          }
        },
        "is_shape_tensor": {
          "type": "boolean",
          "description": "Whether this tensor is a shape tensor",
          "default": false
        },
        "allow_ragged_batch": {
          "type": "boolean",
          "description": "Whether ragged batching is allowed for this tensor",
          "default": false
        }
      }
    },
    "InstanceGroup": {
      "type": "object",
      "description": "Defines a group of model instances deployed on specific devices",
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the instance group"
        },
        "kind": {
          "type": "string",
          "description": "Device type for the instance group",
          "enum": ["KIND_AUTO", "KIND_GPU", "KIND_CPU", "KIND_MODEL"],
          "default": "KIND_AUTO"
        },
        "count": {
          "type": "integer",
          "minimum": 1,
          "description": "Number of instances in this group",
          "default": 1
        },
        "gpus": {
          "type": "array",
          "description": "GPU device IDs to use for this instance group",
          "items": {
            "type": "integer",
            "minimum": 0
          }
        },
        "rate_group": {
          "type": "integer",
          "description": "Rate limiter group assignment"
        },
        "rate_limit": {
          "type": "object",
          "description": "Rate limiting configuration for the instance group",
          "properties": {
            "resources": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "name": {
                    "type": "string"
                  },
                  "global": {
                    "type": "boolean"
                  },
                  "count": {
                    "type": "integer"
                  }
                }
              }
            }
          }
        }
      }
    },
    "DynamicBatching": {
      "type": "object",
      "description": "Dynamic batching configuration for combining multiple inference requests into a single batch",
      "properties": {
        "preferred_batch_size": {
          "type": "array",
          "description": "Preferred batch sizes for dynamic batching",
          "items": {
            "type": "integer",
            "minimum": 1
          }
        },
        "max_queue_delay_microseconds": {
          "type": "integer",
          "minimum": 0,
          "description": "Maximum delay in microseconds to wait for forming a preferred batch"
        },
        "preserve_ordering": {
          "type": "boolean",
          "description": "Whether to preserve the ordering of responses",
          "default": false
        },
        "priority_levels": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of priority levels for request scheduling"
        },
        "default_priority_level": {
          "type": "integer",
          "description": "Default priority level for requests"
        },
        "default_queue_policy": {
          "$ref": "#/$defs/QueuePolicy"
        },
        "priority_queue_policy": {
          "type": "object",
          "description": "Per-priority-level queue policies",
          "additionalProperties": {
            "$ref": "#/$defs/QueuePolicy"
          }
        }
      }
    },
    "QueuePolicy": {
      "type": "object",
      "description": "Queue management policy for inference requests",
      "properties": {
        "timeout_action": {
          "type": "string",
          "enum": ["REJECT", "DELAY"],
          "description": "Action to take when a request times out in the queue"
        },
        "default_timeout_microseconds": {
          "type": "integer",
          "minimum": 0,
          "description": "Default timeout in microseconds for queued requests"
        },
        "allow_timeout_override": {
          "type": "boolean",
          "description": "Whether requests can override the default timeout"
        },
        "max_queue_size": {
          "type": "integer",
          "minimum": 0,
          "description": "Maximum number of requests in the queue"
        }
      }
    },
    "SequenceBatching": {
      "type": "object",
      "description": "Sequence batching configuration for stateful models that process ordered sequences of requests",
      "properties": {
        "max_sequence_idle_microseconds": {
          "type": "integer",
          "minimum": 0,
          "description": "Maximum idle time for a sequence before it is automatically ended"
        },
        "control_input": {
          "type": "array",
          "description": "Control inputs for sequence management",
          "items": {
            "type": "object",
            "properties": {
              "name": {
                "type": "string"
              },
              "control": {
                "type": "array",
                "items": {
                  "type": "object",
                  "properties": {
                    "kind": {
                      "type": "string",
                      "enum": [
                        "CONTROL_SEQUENCE_START",
                        "CONTROL_SEQUENCE_READY",
                        "CONTROL_SEQUENCE_END",
                        "CONTROL_SEQUENCE_CORRID"
                      ]
                    },
                    "int32_false_true": {
                      "type": "array",
                      "items": { "type": "integer" }
                    },
                    "fp32_false_true": {
                      "type": "array",
                      "items": { "type": "number" }
                    },
                    "bool_false_true": {
                      "type": "array",
                      "items": { "type": "boolean" }
                    }
                  }
                }
              }
            }
          }
        },
        "state": {
          "type": "array",
          "description": "Implicit state configurations for the sequence",
          "items": {
            "type": "object",
            "properties": {
              "input_name": { "type": "string" },
              "output_name": { "type": "string" },
              "data_type": { "type": "string" },
              "dims": {
                "type": "array",
                "items": { "type": "integer" }
              }
            }
          }
        }
      }
    },
    "EnsembleScheduling": {
      "type": "object",
      "description": "Ensemble model scheduling configuration defining a pipeline of models",
      "properties": {
        "step": {
          "type": "array",
          "description": "Steps in the ensemble pipeline",
          "items": {
            "type": "object",
            "required": ["model_name"],
            "properties": {
              "model_name": {
                "type": "string",
                "description": "Name of the model in this step"
              },
              "model_version": {
                "type": "integer",
                "description": "Version of the model to use (-1 for latest)"
              },
              "input_map": {
                "type": "object",
                "description": "Mapping from ensemble tensor names to step model input names",
                "additionalProperties": { "type": "string" }
              },
              "output_map": {
                "type": "object",
                "description": "Mapping from step model output names to ensemble tensor names",
                "additionalProperties": { "type": "string" }
              }
            }
          }
        }
      }
    },
    "ModelWarmup": {
      "type": "object",
      "description": "Model warmup configuration for pre-heating the model",
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the warmup configuration"
        },
        "batch_size": {
          "type": "integer",
          "minimum": 1,
          "description": "Batch size to use for warmup"
        },
        "inputs": {
          "type": "object",
          "description": "Input data specifications for warmup",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "data_type": { "type": "string" },
              "dims": {
                "type": "array",
                "items": { "type": "integer" }
              },
              "zero_data": { "type": "boolean" },
              "random_data": { "type": "boolean" }
            }
          }
        },
        "count": {
          "type": "integer",
          "minimum": 1,
          "description": "Number of warmup iterations"
        }
      }
    },
    "Optimization": {
      "type": "object",
      "description": "Model optimization settings",
      "properties": {
        "priority": {
          "type": "string",
          "enum": ["PRIORITY_DEFAULT", "PRIORITY_MIN", "PRIORITY_MAX"],
          "description": "Optimization priority"
        },
        "execution_accelerators": {
          "type": "object",
          "description": "Execution accelerator configurations",
          "properties": {
            "gpu_execution_accelerator": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "name": { "type": "string" },
                  "parameters": {
                    "type": "object",
                    "additionalProperties": { "type": "string" }
                  }
                }
              }
            },
            "cpu_execution_accelerator": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "name": { "type": "string" },
                  "parameters": {
                    "type": "object",
                    "additionalProperties": { "type": "string" }
                  }
                }
              }
            }
          }
        },
        "input_pinned_memory": {
          "type": "object",
          "properties": {
            "enable": { "type": "boolean" }
          }
        },
        "output_pinned_memory": {
          "type": "object",
          "properties": {
            "enable": { "type": "boolean" }
          }
        },
        "gather_kernel_buffer_threshold": {
          "type": "integer",
          "description": "Threshold for using gather kernel for input tensor copy"
        },
        "eager_batching": {
          "type": "boolean",
          "description": "Whether to use eager batching"
        }
      }
    }
  }
}