import "@typespec/openapi";
import "../common/models.tsp";
import "../servicepatterns.tsp";
import "../openai-evaluations/models.tsp";
import "../evaluations/models.tsp";

using TypeSpec.Rest;
using OpenAPI;

namespace Azure.AI.Projects;

// ---------------------------------------------------------------------------
// Enums
// ---------------------------------------------------------------------------

@doc("The subtype of an evaluation suite.")
union EvaluationSuiteSubtype {
  string,

  @doc("Default suite type.")
  default: "default",

  @doc("Benchmark suite.")
  benchmark: "benchmark",
}

@doc("The category of evaluator generation.")
union EvaluationSuiteGenerationCategory {
  string,

  @doc("Quality-focused rubric criteria.")
  quality: "quality",

  @doc("Safety-focused policy criteria.")
  safety: "safety",
}

@doc("The data generation type.")
union DataGenerationType {
  string,

  @doc("Simple question and answer generation.")
  simple_qna: "simple_qna",

  @doc("Traces-based generation.")
  traces: "traces",

  @doc("Tool use generation.")
  tool_use: "tool_use",

  @doc("Task-based generation.")
  task: "task",
}

// ---------------------------------------------------------------------------
// Child models
// ---------------------------------------------------------------------------

@doc("""
  Reference to a dataset by name and version.
  """)
model DatasetReference {
  @doc("Dataset name.")
  name: string;

  @doc("Dataset version. If not provided, resolves to the latest version.")
  version?: string;

  @doc("""
    Name of the schema file within the dataset's blob folder
    (e.g., "a3f2b1c4_schema.json"). Optional — if not provided, schema is
    inferred at runtime from the data. Only applicable for uri_folder
    datasets.
    """)
  schema_file_name?: string;
}

// ---------------------------------------------------------------------------
// CRUD Resource model
// ---------------------------------------------------------------------------

@doc("""
  An evaluation suite bundles testing criteria — an optional dataset, one or
  more evaluator configs with thresholds and init params — into a reusable,
  named artifact that can gate agent changes across batch, scheduled,
  continuous, and CI/CD evals.
  """)
@resource("evaluation_suites")
model EvaluationSuiteVersion {
  @doc("""
    Human-readable display name. Does not need to be unique. Shown in Foundry
    portal list views and eval reports.
    """)
  display_name?: string;

  @doc("""
    Subtype of the evaluation suite.
    """)
  subtype?: EvaluationSuiteSubtype;

  @doc("""
    Dataset reference for evaluation. Optional — omit for evaluator-only
    suites where data comes from live production traces or is provided at run
    time. The referenced dataset must exist in the project's dataset registry.
    """)
  dataset?: DatasetReference;

  @doc("""
    Testing criteria — the evaluator configurations for this suite. Supports
    all grader types: azure_ai_evaluator, string_check, label_model,
    score_model, text_similarity, python, etc. At least one entry is required.
    """)
  @minItems(1)
  #suppress "@azure-tools/typespec-azure-core/no-unnamed-union" "Matching eval group testing_criteria pattern"
  testing_criteria: (
    | OpenAI.EvalGraderLabelModel
    | OpenAI.EvalGraderStringCheck
    | OpenAI.EvalGraderTextSimilarity
    | OpenAI.EvalGraderPython
    | OpenAI.EvalGraderScoreModel
    | TestingCriterionAzureAIEvaluator)[];

  @doc("""
    Target for this evaluation suite. Uses the existing Target discriminated
    type from eval runs. Supports azure_ai_agent, azure_ai_model,
    azure_ai_assistant. Optional — allows suites to exist without a target.
    """)
  target?: Target;

  @doc("""
    How to send dataset rows to the target (agent or model). Supports template
    type (prompt with column placeholders) and item_reference type (column
    containing pre-built messages).
    """)
  #suppress "@azure-tools/typespec-azure-core/no-unnamed-union" "Supporting both input message types"
  input_messages?: OpenAI.CreateEvalResponsesRunDataSourceInputMessagesTemplate | OpenAI.CreateEvalCompletionsRunDataSourceInputMessagesItemReference;

  @doc("""
    Default evaluation level for this suite. Can be overridden at run time.
    """)
  evaluation_level?: EvaluationLevel;

  @doc("The name of the resource.")
  @visibility(Lifecycle.Read)
  @key
  name: string;

  @doc("The version of the resource.")
  @visibility(Lifecycle.Read)
  version: string;

  @doc("The asset description text.")
  @visibility(Lifecycle.Create, Lifecycle.Update)
  description?: string;

  @doc("Tag dictionary. Tags can be added, removed, and updated.")
  @visibility(Lifecycle.Create, Lifecycle.Update)
  // FIX: bare `Record` does not compile — TypeSpec's built-in Record requires
  // an element type argument. Tags are string-to-string maps.
  tags?: Record<string>;
}

// ---------------------------------------------------------------------------
// Run API models
// ---------------------------------------------------------------------------

@doc("Request body for running an evaluation from a suite.")
model EvaluationSuiteRunRequest {
  @doc("Name for the evaluation. Default: '{suiteName}-runs'.")
  evaluation_name?: string;

  @doc("Evaluation suite version to run. Default: latest.")
  evaluation_suite_version?: string;

  @doc("Overrides the suite's default evaluation level. If omitted, uses the level from the suite.")
  evaluation_level?: EvaluationLevel;
}

@doc("Response from running an evaluation suite.")
model EvaluationSuiteRunResponse {
  @doc("The evaluation suite name used.")
  evaluation_suite_name: string;

  @doc("The evaluation suite version resolved.")
  evaluation_suite_version: string;

  @doc("""
    The run results. Currently a single-element array; will support multiple
    runs in the future.
    """)
  results: EvaluationSuiteRunResult[];
}

@doc("Result of a single evaluation run within a suite execution.")
model EvaluationSuiteRunResult {
  @doc("The evaluation ID created.")
  eval_id: string;

  @doc("The eval run ID created.")
  run_id: string;

  @doc("Status of the run.")
  status: JobStatus;

  @doc("Timestamp when the run was created.")
  created_at: FoundryTimestamp;
}

// ---------------------------------------------------------------------------
// Generate API (LRO) models
// ---------------------------------------------------------------------------

// Source types use shared JobSource shapes from common/models.tsp via spread,
// with an evaluation-suite-specific discriminated base (same pattern as
// DataGenerationJobSource).

@doc("The supported source types for evaluation suite generation jobs.")
union EvaluationSuiteJobSourceType {
  string,

  @doc("Prompt source — inline text provided by the user.")
  prompt: "prompt",

  @doc("Agent source — references an agent.")
  agent: "agent",

  @doc("Traces source — conversation traces from Application Insights.")
  traces: "traces",

  @doc("Dataset source — reference to a dataset.")
  dataset: "dataset",
}

@doc("The base source model for evaluation suite generation jobs. Polymorphic over `type`.")
@discriminator("type")
model EvaluationSuiteJobSource {
  @doc("The type of source.")
  type: EvaluationSuiteJobSourceType;

  ...JobSourceDescription;
}

@doc("Prompt source for evaluation suite generation jobs — inline text provided by the user.")
model PromptEvaluationSuiteJobSource extends EvaluationSuiteJobSource {
  ...PromptJobSource;
}

@doc("Agent source for evaluation suite generation jobs — references an agent to fetch instructions and metadata from.")
model AgentEvaluationSuiteJobSource extends EvaluationSuiteJobSource {
  ...AgentJobSource;
}

@doc("Traces source for evaluation suite generation jobs — conversation traces from Application Insights.")
model TracesEvaluationSuiteJobSource extends EvaluationSuiteJobSource {
  ...TracesJobSource;
}

@doc("Dataset source for evaluation suite generation jobs — reference to a dataset.")
model DatasetEvaluationSuiteJobSource extends EvaluationSuiteJobSource {
  ...DatasetJobSource;
}

@doc("Caller-supplied inputs for an evaluation suite generation job.")
model EvaluationSuiteGenerationJobInputs {
  @doc("The evaluation suite name to create.")
  evaluation_suite_name: string;

  @doc("Source materials for generation — agent context, prompts, traces, or datasets.")
  sources: EvaluationSuiteJobSource[];

  @doc("The LLM model to use for rubric and data generation (e.g., 'gpt-4o').")
  generation_model: string;

  @doc("Category determines the generation focus. Default: quality.")
  category?: EvaluationSuiteGenerationCategory = EvaluationSuiteGenerationCategory.quality;

  @doc("""
    Optional initialization parameters applied to all generated evaluators.
    For example, deployment_name for LLM judge model, default threshold.
    """)
  // FIX: bare `Record` does not compile — Record requires an element type.
  // Documented values are heterogeneous (string deployment name, numeric
  // threshold), so the element type is `unknown`.
  initialization_parameters?: Record<unknown>;

  @doc("""
    Data generation options. Controls how the evaluation dataset is generated.
    If omitted, defaults are used (simple_qna, 100 max_samples).
    """)
  data_generation_options?: EvaluationSuiteDataGenerationOptions;
}

@doc("Options for dataset generation within an evaluation suite generation job.")
model EvaluationSuiteDataGenerationOptions {
  @doc("The data generation type. Defaults to 'simple_qna' if not specified.")
  type?: DataGenerationType;

  @doc("Maximum number of samples to generate. Valid range: 15-1000.")
  @minValue(15)
  @maxValue(1000)
  max_samples?: int32;
}

@doc("""
  Evaluation suite generation job resource — a long-running job that generates
  testing criteria and optionally a dataset from source materials. On success,
  the result is the persisted EvaluationSuiteVersion.
  """)
model EvaluationSuiteGenerationJob is JobLike {
  @doc("The timestamp when the job was created, represented in Unix time.")
  @visibility(Lifecycle.Read)
  created_at: FoundryTimestamp;

  @doc("The timestamp when the job finished, represented in Unix time.")
  @visibility(Lifecycle.Read)
  finished_at?: FoundryTimestamp;

  @doc("Token consumption summary. Populated on terminal states.")
  @visibility(Lifecycle.Read)
  usage?: EvaluationSuiteGenerationTokenUsage;
}

@doc("Token usage summary for an evaluation suite generation job.")
model EvaluationSuiteGenerationTokenUsage {
  @doc("Number of input tokens consumed.")
  input_tokens?: int64;

  @doc("Number of output tokens consumed.")
  output_tokens?: int64;

  @doc("Total tokens consumed.")
  total_tokens?: int64;
}