# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: dataproc_submit_spark_job
description: >-
  Submits a Cloud Dataproc job for running Apache Spark applications on YARN.
metadata:
  labels:
    add-pod-env: 'true'
inputs:
  - name: project_id
    description: >-
      Required. The ID of the Google Cloud Platform project that the cluster
      belongs to.
    type: GCPProjectID
  - name: region
    description: >-
      Required. The Cloud Dataproc region in which to handle the request.
    type: GCPRegion
  - name: cluster_name
    description: 'Required. The cluster to run the job.'
    type: String
  - name: main_jar_file_uri
    default: ''
    description: >-
      The HCFS URI of the jar file that contains the main class.
    type: GCSPath
  - name: main_class
    default: ''
    description: >-
      The name of the driver's main class. The jar file that contains the
      class must be in the default CLASSPATH or specified in jarFileUris.
    type: String
  - name: args
    default: ''
    description: >-
      Optional. The arguments to pass to the driver. Do not include arguments,
      such as --conf, that can be set as job properties, since a collision may
      occur that causes an incorrect job submission.
    type: List
  - name: spark_job
    default: ''
    description: >-
      Optional. The full payload of a
      [SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).
    type: Dict
  - name: job
    default: ''
    description: >-
      Optional. The full payload of a
      [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
    type: Dict
  - name: wait_interval
    default: '30'
    description: >-
      Optional. The number of seconds to wait between polls of the operation.
      Defaults to 30.
    type: Integer
outputs:
  - name: job_id
    description: 'The ID of the created job.'
    type: String
  - name: MLPipeline UI metadata
    type: UI metadata
implementation:
  container:
    image: gcr.io/ml-pipeline/ml-pipeline-gcp:1.7.0-rc.3
    args: [
      --ui_metadata_path, {outputPath: MLPipeline UI metadata},
      kfp_component.google.dataproc, submit_spark_job,
      --project_id, {inputValue: project_id},
      --region, {inputValue: region},
      --cluster_name, {inputValue: cluster_name},
      --main_jar_file_uri, {inputValue: main_jar_file_uri},
      --main_class, {inputValue: main_class},
      --args, {inputValue: args},
      --spark_job, {inputValue: spark_job},
      --job, {inputValue: job},
      --wait_interval, {inputValue: wait_interval},
      --job_id_output_path, {outputPath: job_id},
    ]
    env:
      KFP_POD_NAME: "{{pod.name}}"
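
# A minimal usage sketch, kept as comments so this file remains a valid
# component spec: it shows one way to load this component with the KFP v1 SDK
# and call it from a pipeline. The local file path, cluster name, and jar URI
# below are hypothetical placeholders, not values defined by this component.
#
#   import kfp
#   from kfp import components, dsl
#
#   # Load the component factory from this spec file (hypothetical path).
#   dataproc_submit_spark_job_op = components.load_component_from_file(
#       'component.yaml')
#
#   @dsl.pipeline(name='spark-on-dataproc')
#   def pipeline(project_id: str, region: str = 'us-central1'):
#       # Submit a Spark job to an existing Dataproc cluster and wait for it.
#       dataproc_submit_spark_job_op(
#           project_id=project_id,
#           region=region,
#           cluster_name='my-cluster',                   # hypothetical
#           main_jar_file_uri='gs://my-bucket/app.jar',  # hypothetical
#           args='["--input", "gs://my-bucket/data"]',
#           wait_interval='30')
#
#   if __name__ == '__main__':
#       kfp.compiler.Compiler().compile(pipeline, 'pipeline.yaml')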