{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://volcano.sh/schemas/job.json",
  "title": "Volcano Job",
  "description": "Schema for the Volcano Job (vcjob) custom resource definition. A Volcano Job defines a batch workload with multiple task types, lifecycle policies, gang scheduling requirements, plugin integration, and queue assignment for Kubernetes batch processing.",
  "type": "object",
  "required": ["apiVersion", "kind", "metadata", "spec"],
  "properties": {
    "apiVersion": {
      "type": "string",
      "description": "API version of the Volcano Job resource.",
      "const": "batch.volcano.sh/v1alpha1"
    },
    "kind": {
      "type": "string",
      "description": "Resource kind.",
      "const": "Job"
    },
    "metadata": { "$ref": "#/$defs/ObjectMeta" },
    "spec": { "$ref": "#/$defs/JobSpec" },
    "status": { "$ref": "#/$defs/JobStatus" }
  },
  "$defs": {
    "JobSpec": {
      "type": "object",
      "description": "Specification of a Volcano Job defining workload structure, tasks, scheduling requirements, and lifecycle policies.",
      "properties": {
        "schedulerName": {
          "type": "string",
          "description": "Name of the scheduler responsible for this job. Defaults to 'volcano'.",
          "default": "volcano"
        },
        "minAvailable": {
          "type": "integer",
          "description": "Minimum number of pods that must be schedulable simultaneously for gang scheduling. If fewer pods are available, none are scheduled.",
          "minimum": 0
        },
        "tasks": {
          "type": "array",
          "description": "List of task groups composing the job. Each task defines a pod template, replica count, and task-level policies.",
          "items": { "$ref": "#/$defs/TaskSpec" }
        },
        "policies": {
          "type": "array",
          "description": "Job-level lifecycle policies controlling how the job reacts to events like pod failures or completions.",
          "items": { "$ref": "#/$defs/LifecyclePolicy" }
        },
        "plugins": {
          "type": "object",
          "description": "Map of plugin names to argument arrays. Plugins inject environment variables and sidecar containers for ML frameworks. Common plugins include 'tensorflow', 'pytorch', 'mpi', 'svc', and 'env'.",
          "additionalProperties": {
            "type": "array",
            "items": { "type": "string" }
          },
          "examples": [
            { "svc": [], "env": [] },
            { "pytorch": ["--master=master", "--worker=worker"] },
            { "mpi": ["--master=mpimaster", "--worker=mpiworker"] }
          ]
        },
        "queue": {
          "type": "string",
          "description": "Name of the Volcano Queue to submit this job to. Determines scheduling priority and resource quota constraints."
        },
        "priorityClassName": {
          "type": "string",
          "description": "Kubernetes PriorityClass name controlling this job's scheduling priority."
        },
        "maxRetry": {
          "type": "integer",
          "description": "Maximum number of retry attempts before the job is permanently failed.",
          "minimum": 0
        },
        "ttlSecondsAfterFinished": {
          "type": "integer",
          "description": "Seconds after completion before the job is automatically garbage-collected. Omit to disable automatic cleanup.",
          "minimum": 0
        },
        "runningDuration": {
          "type": "string",
          "description": "Maximum allowed running duration for the job (e.g. '2h', '30m'). Job is terminated if it exceeds this duration.",
          "examples": ["1h", "30m", "2h30m"]
        },
        "volumes": {
          "type": "array",
          "description": "Volumes to be mounted into job task containers.",
          "items": { "type": "object" }
        }
      }
    },
    "TaskSpec": {
      "type": "object",
      "description": "A named task group within a Volcano Job, defining a set of pods with a shared pod template and optional task-level policies.",
      "required": ["name", "replicas", "template"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Unique name of the task within the job. Used to reference the task in plugins and dependencies.",
          "maxLength": 63,
          "pattern": "^[a-z0-9]([a-z0-9-]*[a-z0-9])?$"
        },
        "replicas": {
          "type": "integer",
          "description": "Number of pod replicas for this task.",
          "minimum": 1
        },
        "template": {
          "type": "object",
          "description": "Kubernetes pod template spec for pods in this task."
        },
        "policies": {
          "type": "array",
          "description": "Lifecycle policies for this task, overriding job-level policies for events affecting this task's pods.",
          "items": { "$ref": "#/$defs/LifecyclePolicy" }
        },
        "topologyPolicy": {
          "type": "string",
          "description": "NUMA topology policy for resource allocation. Controls how CPU and memory are assigned relative to NUMA nodes.",
          "enum": ["none", "best-effort", "restricted", "single-numa-node"]
        },
        "maxRetry": {
          "type": "integer",
          "description": "Maximum retry attempts for this task before it is marked as failed.",
          "minimum": 0
        },
        "dependsOn": {
          "type": "object",
          "description": "Task dependency configuration controlling execution order within the job.",
          "properties": {
            "name": {
              "type": "array",
              "description": "Names of tasks that must successfully complete before this task starts.",
              "items": { "type": "string" }
            },
            "iteration": {
              "type": "string",
              "description": "Dependency evaluation strategy.",
              "enum": ["any", "all"]
            }
          }
        }
      }
    },
    "LifecyclePolicy": {
      "type": "object",
      "description": "A lifecycle policy rule defining an automated action to take when a specific event occurs during job or task execution.",
      "properties": {
        "action": {
          "type": "string",
          "description": "Action to execute when the policy condition is met.",
          "enum": [
            "AbortJob",
            "RestartJob",
            "RestartTask",
            "TerminateJob",
            "CompleteJob",
            "ResumeJob",
            "SyncJob",
            "EnqueueJob"
          ]
        },
        "event": {
          "type": "string",
          "description": "Single event type that triggers this policy.",
          "enum": [
            "PodFailed",
            "PodEvicted",
            "PodPending",
            "PodRunning",
            "PodSucceeded",
            "TaskCompleted",
            "AnyEvent",
            "CommandIssued",
            "JobUnknown",
            "JobUpdated",
            "OutOfSync"
          ]
        },
        "events": {
          "type": "array",
          "description": "Multiple event types that each independently trigger this policy.",
          "items": { "type": "string" }
        },
        "exitCode": {
          "type": "integer",
          "description": "Container exit code that triggers this policy when a pod exits with this code."
        },
        "timeout": {
          "type": "string",
          "description": "Duration after which the policy fires if the triggering condition persists.",
          "examples": ["30s", "5m", "1h"]
        }
      }
    },
    "JobStatus": {
      "type": "object",
      "description": "Observed status of a Volcano Job including its lifecycle phase, retry count, and per-task pod counts.",
      "properties": {
        "state": {
          "type": "object",
          "description": "Current phase and transition details of the job.",
          "properties": {
            "phase": {
              "type": "string",
              "description": "Current lifecycle phase of the job.",
              "enum": [
                "Pending",
                "Aborting",
                "Aborted",
                "Running",
                "Restarting",
                "Completing",
                "Completed",
                "Terminating",
                "Terminated",
                "Failed"
              ]
            },
            "reason": {
              "type": "string",
              "description": "Machine-readable reason for the current phase."
            },
            "message": {
              "type": "string",
              "description": "Human-readable description of the current state."
            },
            "lastTransitionTime": {
              "type": "string",
              "format": "date-time",
              "description": "Timestamp of the most recent phase transition."
            }
          }
        },
        "minAvailable": {
          "type": "integer",
          "description": "Minimum pod count required for this job."
        },
        "retryCount": {
          "type": "integer",
          "format": "int32",
          "description": "Number of times the job has been retried so far."
        },
        "runningDuration": {
          "type": "string",
          "description": "Duration the job has been in the Running phase."
        },
        "taskStatusCount": {
          "type": "object",
          "description": "Per-task breakdown of pod counts by Kubernetes pod phase.",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "phase": {
                "type": "object",
                "description": "Map of Kubernetes pod phase names to pod counts.",
                "additionalProperties": { "type": "integer" }
              }
            }
          }
        },
        "conditions": {
          "type": "array",
          "description": "Detailed conditions describing the current state of the job.",
          "items": {
            "type": "object",
            "properties": {
              "type": { "type": "string" },
              "status": {
                "type": "string",
                "enum": ["True", "False", "Unknown"]
              },
              "transitionID": { "type": "string" },
              "lastTransitionTime": {
                "type": "string",
                "format": "date-time"
              },
              "reason": { "type": "string" },
              "message": { "type": "string" }
            }
          }
        }
      }
    },
    "ObjectMeta": {
      "type": "object",
      "description": "Standard Kubernetes object metadata.",
      "required": ["name"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the job, unique within its namespace.",
          "maxLength": 253,
          "pattern": "^[a-z0-9]([a-z0-9.-]*[a-z0-9])?$"
        },
        "namespace": {
          "type": "string",
          "description": "Namespace the job belongs to."
        },
        "labels": {
          "type": "object",
          "description": "Labels for organizing and selecting the job.",
          "additionalProperties": { "type": "string" }
        },
        "annotations": {
          "type": "object",
          "description": "Non-identifying metadata for the job.",
          "additionalProperties": { "type": "string" }
        }
      }
    }
  },
  "examples": [
    {
      "apiVersion": "batch.volcano.sh/v1alpha1",
      "kind": "Job",
      "metadata": {
        "name": "pytorch-training",
        "namespace": "default"
      },
      "spec": {
        "minAvailable": 3,
        "schedulerName": "volcano",
        "queue": "training",
        "plugins": {
          "pytorch": ["--master=master", "--worker=worker"],
          "svc": [],
          "env": []
        },
        "policies": [
          { "event": "PodEvicted", "action": "RestartJob" }
        ],
        "maxRetry": 3,
        "tasks": [
          {
            "name": "master",
            "replicas": 1,
            "template": {
              "spec": {
                "containers": [
                  {
                    "name": "pytorch",
                    "image": "pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime",
                    "resources": {
                      "requests": { "cpu": "4", "memory": "8Gi" },
                      "limits": { "cpu": "4", "memory": "8Gi", "nvidia.com/gpu": "1" }
                    }
                  }
                ]
              }
            }
          },
          {
            "name": "worker",
            "replicas": 2,
            "template": {
              "spec": {
                "containers": [
                  {
                    "name": "pytorch",
                    "image": "pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime",
                    "resources": {
                      "requests": { "cpu": "4", "memory": "8Gi" },
                      "limits": { "cpu": "4", "memory": "8Gi", "nvidia.com/gpu": "2" }
                    }
                  }
                ]
              }
            }
          }
        ]
      }
    }
  ]
}