---
# Katib Experiment that searches over learning rate and momentum for a
# PyTorchJob-based MNIST training run, minimizing the reported "loss" metric.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  namespace: kubeflow
  name: pytorchjob-mnist
spec:
  # Trial budget: up to 12 trials in total, 3 at a time, with at most
  # 3 failed trials allowed.
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3

  # Optimization target: drive the "loss" metric down towards 0.001.
  objective:
    type: minimize
    goal: 0.001
    objectiveMetricName: loss

  # Random search over the parameters declared below.
  algorithm:
    algorithmName: random

  # Search space. Bounds are kept as quoted strings, as in the original.
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.05"
    - name: momentum
      parameterType: double
      feasibleSpace:
        min: "0.5"
        max: "0.9"

  trialTemplate:
    # Name of the container (in the replica specs below) that runs training.
    primaryContainerName: pytorch

    # Maps search-space parameters (reference) to the names used in the
    # ${trialParameters.<name>} placeholders of trialSpec.
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
      - name: momentum
        description: Momentum for the training model
        reference: momentum

    # Job created for every trial. Master and Worker run the same image and
    # command; the placeholders are presumably substituted per trial by Katib.
    trialSpec:
      apiVersion: kubeflow.org/v1
      kind: PyTorchJob
      spec:
        pytorchReplicaSpecs:
          Master:
            replicas: 1
            restartPolicy: OnFailure
            template:
              spec:
                containers:
                  - name: pytorch
                    image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
                    command:
                      - "python3"
                      - "/opt/pytorch-mnist/mnist.py"
                      - "--epochs=1"
                      - "--batch-size=16"
                      - "--lr=${trialParameters.learningRate}"
                      - "--momentum=${trialParameters.momentum}"
          Worker:
            replicas: 1
            restartPolicy: OnFailure
            template:
              spec:
                containers:
                  - name: pytorch
                    image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:latest
                    command:
                      - "python3"
                      - "/opt/pytorch-mnist/mnist.py"
                      - "--epochs=1"
                      - "--batch-size=16"
                      - "--lr=${trialParameters.learningRate}"
                      - "--momentum=${trialParameters.momentum}"