import "@typespec/rest";
import "@azure-tools/typespec-azure-core";
import "@azure-tools/typespec-azure-resource-manager";
import "./aimanager.tsp";
import "./aimanagernamespace.tsp";
import "./aimodel.tsp";
import "./modelsource.tsp";
import "./helpers.tsp";

using TypeSpec.Http;
using TypeSpec.Rest;
using TypeSpec.Versioning;
using Azure.Core;
using Azure.ResourceManager;

namespace Microsoft.ContainerService;

// ---------------------------------------------------------------------------
// ModelDeployment: a proxy resource nested under AIManagerNamespace, together
// with its properties, status, PATCH shape and resource operations.
// ---------------------------------------------------------------------------

// The model reference type. Phase 1 accepts an `AIModel` resource id only.
// Future model resource types (e.g. a user-registered BYO model) can be
// added by widening the underlying ARM type without a breaking change.
#suppress "@azure-tools/typespec-azure-core/casing-style" "AIModel is a valid name"
@added(Versions.v2026_05_02_preview)
@doc("The ARM resource id of an AIModel.")
scalar AIModelResourceId
  extends Azure.Core.armResourceIdentifier<[
    {
      type: "Microsoft.ContainerService/aiModels",
    }
  ]>;

@added(Versions.v2026_05_02_preview)
@doc("The ARM resource id of a ModelSource.")
scalar ModelSourceResourceId
  extends Azure.Core.armResourceIdentifier<[
    {
      type: "Microsoft.ContainerService/aiManagers/modelSources",
    }
  ]>;

// The resource envelope. `ModelDeploymentProperties` is supplied as the
// properties bag; the name parameter fixes the URL segment and key name and
// constrains the name to lowercase alphanumerics and hyphens.
@added(Versions.v2026_05_02_preview)
@doc("A running deployment of a model in an AI Manager namespace.")
@resource("modelDeployments")
@parentResource(AIManagerNamespace)
model ModelDeployment is ProxyResource<ModelDeploymentProperties> {
  ...ResourceNameParameter<
    Resource = ModelDeployment,
    KeyName = "modelDeploymentName",
    SegmentName = "modelDeployments",
    NamePattern = "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$"
  >;
  ...EntityTagProperty;
}

// Properties marked Create + Read are settable only at creation time;
// properties marked Read only are service-populated.
@added(Versions.v2026_05_02_preview)
@doc("Model deployment properties.")
model ModelDeploymentProperties {
  @visibility(Lifecycle.Read)
  @doc("The status of the last reconciliation.")
  provisioningState?: ModelDeploymentProvisioningState;

  @visibility(Lifecycle.Create, Lifecycle.Read)
  @doc("Full ARM resource id of the model to deploy. Phase 1 accepts an `AIModel` resource id only. Immutable after creation.")
  modelResourceId: AIModelResourceId;

  @visibility(Lifecycle.Create, Lifecycle.Read)
  @doc("Full ARM resource id of a `ModelSource` to use when pulling artifacts for this deployment. Immutable after creation.")
  modelSourceResourceId?: ModelSourceResourceId;

  @doc("Runtime performance mode. Selects a default engine/quantization combination; use `overrides` to pin values.")
  performanceMode?: ModelDeploymentPerformanceMode = ModelDeploymentPerformanceMode.Balanced;

  @visibility(Lifecycle.Create, Lifecycle.Read)
  @doc("Azure VM SKU used to host the deployment, e.g. \"Standard_NC96ads_A100_v4\". Immutable after creation.")
  vmSize: string;

  @doc("Desired replica count. Ignored when `autoscaling.enabled` is true.")
  @minValue(1)
  replicas?: int32 = 1;

  @doc("The autoscaling configuration for the deployment.")
  autoscaling?: AutoscalingProfile;

  @doc("User overrides layered on top of profile resolution. Replace semantics on PATCH: the entire object is replaced.")
  overrides?: ModelDeploymentOverrides;

  @visibility(Lifecycle.Read)
  @doc("Runtime status, populated once reconciliation begins.")
  status?: ModelDeploymentStatus;
}

// Open union: the `string` variant keeps the set extensible, and
// `ResourceProvisioningState` contributes the shared ARM states.
@added(Versions.v2026_05_02_preview)
@doc("The provisioning state of a model deployment resource.")
union ModelDeploymentProvisioningState {
  string,
  ResourceProvisioningState,

  @doc("Resource is being created.")
  Creating: "Creating",

  @doc("Resource is updating.")
  Updating: "Updating",

  @doc("Resource is deleting.")
  Deleting: "Deleting",
}

// Open union (includes `string`) so further modes can be added later.
@added(Versions.v2026_05_02_preview)
@doc("The runtime performance mode of a model deployment.")
union ModelDeploymentPerformanceMode {
  string,

  @doc("A balanced trade-off between latency and throughput (default).")
  Balanced: "Balanced",

  @doc("Optimize for low request latency.")
  Latency: "Latency",

  @doc("Optimize for high aggregate throughput.")
  Throughput: "Throughput",
}

@added(Versions.v2026_05_02_preview)
@doc("The runtime status of a model deployment. All fields are read-only and populated once reconciliation has started.")
model ModelDeploymentStatus {
  @visibility(Lifecycle.Read)
  @doc("The inference endpoint URL exposed by the deployment, once ready.")
  endpoint?: url;

  @visibility(Lifecycle.Read)
  @doc("The inference engine used to serve the model, e.g. \"vllm\".")
  engine?: string;

  @visibility(Lifecycle.Read)
  @doc("The version of the inference engine, e.g. \"0.17\".")
  engineVersion?: string;

  @visibility(Lifecycle.Read)
  @doc("The maximum model context length, in tokens, configured for this deployment.")
  maxModelLen?: int32;

  @visibility(Lifecycle.Read)
  @doc("The quantization level applied to the model weights, e.g. \"fp16\", \"awq-int4\".")
  quantization?: string;

  @visibility(Lifecycle.Read)
  @doc("The desired replica count reported by the controller. Equals `properties.replicas` when autoscaler is disabled; current target replica count otherwise.")
  desiredReplicas?: int32;

  @visibility(Lifecycle.Read)
  @doc("The peak tokens per minute measured by live stress test.")
  peakTokensPerMinute?: int32;

  @visibility(Lifecycle.Read)
  @doc("Estimated total time, in seconds, for the deployment to become ready end-to-end (GPU node provisioning, image/weight pull, engine warm-up).")
  estimatedProvisionTimeSeconds?: int32;
}

@added(Versions.v2026_05_02_preview)
@doc("User overrides for a model deployment.")
model ModelDeploymentOverrides {
  // NOTE(review): the value type is assumed to be `string` based on the
  // "key/value pairs" description — confirm against the service contract.
  #suppress "@azure-tools/typespec-azure-resource-manager/arm-no-record" "Free-form override key/value pairs."
  @doc("Free-form override key/value pairs. Recognized keys are documented per release.")
  values?: Record<string>;
}

@added(Versions.v2026_05_02_preview)
@doc("Autoscaling configuration for a model deployment.")
model AutoscalingProfile {
  @doc("Whether autoscaling is enabled for this deployment.")
  enabled?: boolean = false;

  @doc("The minimum number of replicas when autoscaling is enabled.")
  @minValue(1)
  minReplicas?: int32 = 1;

  @doc("The maximum number of replicas when autoscaling is enabled. If not specified, the service derives a default from the subscription GPU quota.")
  @minValue(1)
  maxReplicas?: int32;
}

@added(Versions.v2026_05_02_preview)
@doc("The model deployment resource patch model.")
model ModelDeploymentPatch {
  @doc("Mutable properties of the model deployment.")
  properties?: ModelDeploymentPatchProperties;
}

// Only the properties that are not restricted to Create/Read visibility in
// `ModelDeploymentProperties` appear here. No defaults are declared, so an
// omitted field carries no value in the PATCH body.
@added(Versions.v2026_05_02_preview)
@doc("Mutable properties of a model deployment.")
model ModelDeploymentPatchProperties {
  @doc("Runtime performance mode.")
  performanceMode?: ModelDeploymentPerformanceMode;

  @doc("Desired replica count. Ignored when `autoscaling.enabled` is true.")
  @minValue(1)
  replicas?: int32;

  @doc("The autoscaling configuration for the deployment.")
  autoscaling?: AutoscalingProfile;

  @doc("User overrides layered on top of profile resolution.")
  overrides?: ModelDeploymentOverrides;
}

// Resource operations. Create and delete are long-running (Async templates);
// PATCH is synchronous. Conditional-request parameters are mixed into the
// base parameters of the write operations.
// NOTE(review): `IfMatchParameters` / `IfNoneMatchParameters` are declared
// outside this file (presumably in ./helpers.tsp). If they are templates they
// need a `<ModelDeployment>` argument here as well — confirm.
@added(Versions.v2026_05_02_preview)
@armResourceOperations
interface ModelDeployments {
  get is ArmResourceRead<ModelDeployment>;

  createOrUpdate is ArmResourceCreateOrReplaceAsync<
    ModelDeployment,
    Azure.ResourceManager.Foundations.BaseParameters<ModelDeployment> &
      IfMatchParameters &
      IfNoneMatchParameters
  >;

  update is ArmCustomPatchSync<
    ModelDeployment,
    ModelDeploymentPatch,
    Azure.ResourceManager.Foundations.BaseParameters<ModelDeployment> &
      IfMatchParameters
  >;

  delete is ArmResourceDeleteWithoutOkAsync<
    ModelDeployment,
    Azure.ResourceManager.Foundations.BaseParameters<ModelDeployment> &
      IfMatchParameters
  >;

  #suppress "@azure-tools/typespec-azure-core/casing-style" "AIManager is a valid name"
  listByAIManagerNamespace is ArmResourceListByParent<ModelDeployment>;
}

// Length bounds and documentation for the resource name, applied via augment
// decorators because the `name` property comes from ResourceNameParameter.
@@maxLength(ModelDeployment.name, 63);
@@minLength(ModelDeployment.name, 1);
@@doc(ModelDeployment.name, "The name of the model deployment resource.");