name: Transform using TF on Dataflow
description: Runs TensorFlow Transform on Google Cloud Dataflow
inputs:
- {name: Training data file pattern, type: GCSPath, description: 'GCS path of train file patterns.'} #Also supports local CSV
  # type: {GCSPath: {data_type: CSV}}
- {name: Evaluation data file pattern, type: GCSPath, description: 'GCS path of eval file patterns.'} #Also supports local CSV
  # type: {GCSPath: {data_type: CSV}}
- {name: Schema, type: GCSPath, description: 'GCS path of the JSON schema file.'}
  # type: {GCSPath: {data_type: JSON}}
- {name: GCP project, type: GCPProjectID, description: 'The GCP project to run the Dataflow job in.'}
- {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud".'}
- {name: Preprocessing module, type: GCSPath, default: '', description: 'GCS path to a Python file that defines "preprocess" and "get_feature_columns" functions.'}
  # type: {GCSPath: {data_type: Python}}
- {name: Transformed data dir, type: GCSPath, description: 'GCS or local directory.'} #Also supports local paths
  # type: {GCSPath: {path_type: Directory}}
outputs:
- {name: Transformed data dir, type: GCSPath}
  # type: {GCSPath: {path_type: Directory}}
implementation:
  container:
    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:57d9f7f1cfd458e945d297957621716062d89a49
    command: [python2, /ml/transform.py]
    args: [
      --train, {inputValue: Training data file pattern},
      --eval, {inputValue: Evaluation data file pattern},
      --schema, {inputValue: Schema},
      --project, {inputValue: GCP project},
      --mode, {inputValue: Run mode},
      --preprocessing-module, {inputValue: Preprocessing module},
      --output, {inputValue: Transformed data dir},
    ]
    fileOutputs:
      Transformed data dir: /output.txt
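
For context, here is a minimal sketch of how a spec like this is typically consumed from the KFP v1 SDK. The local file name `component.yaml`, the bucket paths, and the project ID are illustrative assumptions, not values from the spec; the keyword-argument names are the snake_case forms the SDK derives from the input names above.

```python
import kfp
from kfp import components

# Assumption: the spec above is saved locally as component.yaml.
dataflow_tf_transform_op = components.load_component_from_file('component.yaml')

@kfp.dsl.pipeline(
    name='tft-example',
    description='Illustrative use of the TF Transform on Dataflow component.',
)
def tft_pipeline():
    # The SDK turns each declared input name into a snake_case keyword argument.
    transform = dataflow_tf_transform_op(
        training_data_file_pattern='gs://my-bucket/train/part-*',   # illustrative path
        evaluation_data_file_pattern='gs://my-bucket/eval/part-*',  # illustrative path
        schema='gs://my-bucket/schema.json',                        # illustrative path
        gcp_project='my-gcp-project',                               # illustrative project ID
        run_mode='cloud',
        preprocessing_module='gs://my-bucket/preprocessing.py',     # illustrative path
        transformed_data_dir='gs://my-bucket/transformed',          # illustrative path
    )
    # The declared output can then feed downstream steps, e.g.
    # transform.outputs['transformed_data_dir'].
```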
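
The "Preprocessing module" input expects a Python file defining `preprocess` and `get_feature_columns` functions. The spec does not document the exact signatures that the container's /ml/transform.py expects, so the following is only an illustrative sketch assuming a conventional tf.Transform preprocessing-fn shape; the feature names and the `get_feature_columns` signature are made up.

```python
import tensorflow as tf
import tensorflow_transform as tft

def preprocess(inputs):
    """Hypothetical tf.Transform preprocessing fn: maps raw features to
    transformed features. Feature names here are illustrative."""
    return {
        'age_scaled': tft.scale_to_z_score(inputs['age']),
        'occupation_id': tft.compute_and_apply_vocabulary(inputs['occupation']),
    }

def get_feature_columns():
    """Hypothetical helper returning feature columns for a downstream trainer;
    the signature actually expected by /ml/transform.py may differ."""
    return [
        tf.feature_column.numeric_column('age_scaled'),
        tf.feature_column.categorical_column_with_identity(
            'occupation_id', num_buckets=100),
    ]
```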