# Component specification for launching an Amazon SageMaker training job.
#
# The container declared under `implementation` runs `python train.py`,
# forwarding every declared input as a `--<input_name> <value>` command-line
# pair, and publishes its three outputs through files under /tmp.
#
# Every default is a quoted string (including numbers, booleans and JSON-like
# values such as '{}') because the values are handed to train.py as
# command-line text; how train.py parses them is not visible in this file.
name: 'Sagemaker - Training Job'
description: |
  Train Machine Learning and Deep Learning Models using SageMaker

# Inputs without a `default` (region, role, channels, model_artifact_path)
# declare no fallback value in this spec.
inputs:
  - name: region
    description: 'The region where the training job launches.'
  - name: job_name
    description: 'The name of the batch training job.'
    default: ''
  - name: role
    description: 'The Amazon Resource Name (ARN) that Amazon SageMaker assumes to perform tasks on your behalf.'
  - name: image
    description: 'The registry path of the Docker image that contains the training algorithm.'
    default: ''
  - name: algorithm_name
    description: 'The name of the algorithm resource to use for the training job. Do not specify a value for this if using training image.'
    default: ''
  - name: metric_definitions
    description: 'The dictionary of name-regex pairs that specify the metrics that the algorithm emits.'
    default: '{}'
  - name: training_input_mode
    description: 'The input mode that the algorithm supports. File or Pipe.'
    default: 'File'
  - name: hyperparameters
    description: 'Dictionary of hyperparameters for the algorithm.'
    default: '{}'
  - name: channels
    description: 'A list of dicts specifying the input channels. Must have at least one.'
  - name: instance_type
    description: 'The ML compute instance type.'
    default: 'ml.m4.xlarge'
  - name: instance_count
    description: 'The number of ML compute instances to use in each training job.'
    default: '1'
  - name: volume_size
    description: 'The size of the ML storage volume that you want to provision.'
    default: '30'
  - name: resource_encryption_key
    description: 'The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).'
    default: ''
  - name: max_run_time
    description: 'The maximum run time in seconds for the training job.'
    default: '86400'
  - name: model_artifact_path
    description: 'Identifies the S3 path where you want Amazon SageMaker to store the model artifacts.'
  - name: output_encryption_key
    description: 'The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts.'
    default: ''
  - name: vpc_security_group_ids
    description: 'The VPC security group IDs, in the form sg-xxxxxxxx.'
    default: ''
  - name: vpc_subnets
    description: 'The ID of the subnets in the VPC to which you want to connect your training job.'
    default: ''
  - name: network_isolation
    description: 'Isolates the training container.'
    default: 'True'
  - name: traffic_encryption
    description: 'Encrypts all communications between ML compute instances in distributed training.'
    default: 'False'
  - name: spot_instance
    description: 'Use managed spot training.'
    default: 'False'
  - name: max_wait_time
    description: 'The maximum time in seconds you are willing to wait for a managed spot training job to complete.'
    default: '86400'
  - name: checkpoint_config
    description: 'Dictionary of information about the output location for managed spot training checkpoint data.'
    default: '{}'
  - name: endpoint_url
    description: 'The endpoint URL for the private link VPC endpoint.'
    default: ''
  - name: tags
    description: 'Key-value pairs, to categorize AWS resources.'
    default: '{}'

# Each output name below has a matching entry under `fileOutputs`.
outputs:
  - name: model_artifact_url
    description: 'Model artifacts url'
  - name: job_name
    description: 'Training job name'
  - name: training_image
    description: 'The registry path of the Docker image that contains the training algorithm'

implementation:
  container:
    image: amazon/aws-sagemaker-kfp-components:0.3.0
    command: ['python']
    # Script name first, then one flag followed by its input value per input.
    args:
      - train.py
      - --region
      - {inputValue: region}
      - --endpoint_url
      - {inputValue: endpoint_url}
      - --job_name
      - {inputValue: job_name}
      - --role
      - {inputValue: role}
      - --image
      - {inputValue: image}
      - --algorithm_name
      - {inputValue: algorithm_name}
      - --metric_definitions
      - {inputValue: metric_definitions}
      - --training_input_mode
      - {inputValue: training_input_mode}
      - --hyperparameters
      - {inputValue: hyperparameters}
      - --channels
      - {inputValue: channels}
      - --instance_type
      - {inputValue: instance_type}
      - --instance_count
      - {inputValue: instance_count}
      - --volume_size
      - {inputValue: volume_size}
      - --resource_encryption_key
      - {inputValue: resource_encryption_key}
      - --max_run_time
      - {inputValue: max_run_time}
      - --model_artifact_path
      - {inputValue: model_artifact_path}
      - --output_encryption_key
      - {inputValue: output_encryption_key}
      - --vpc_security_group_ids
      - {inputValue: vpc_security_group_ids}
      - --vpc_subnets
      - {inputValue: vpc_subnets}
      - --network_isolation
      - {inputValue: network_isolation}
      - --traffic_encryption
      - {inputValue: traffic_encryption}
      - --spot_instance
      - {inputValue: spot_instance}
      - --max_wait_time
      - {inputValue: max_wait_time}
      - --checkpoint_config
      - {inputValue: checkpoint_config}
      - --tags
      - {inputValue: tags}
    # Files from which each declared output is read.
    fileOutputs:
      model_artifact_url: /tmp/model_artifact_url.txt
      job_name: /tmp/job_name.txt
      training_image: /tmp/training_image.txt