name: Xgboost train
description: |-
  Train an XGBoost model.

      Args:
          training_data_path: Path for the training data in CSV format.
          model_path: Output path for the trained model in binary XGBoost format.
          model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.
          starting_model_path: Path for the existing trained model to start from.
          label_column: Column containing the label data.
          num_iterations: Number of boosting iterations.
          booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html
          objective: The learning task and the corresponding learning objective.
              See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
              The most common values are:
              "reg:squarederror" - Regression with squared loss (default).
              "reg:logistic" - Logistic regression.
              "binary:logistic" - Logistic regression for binary classification, output probability.
              "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation
              "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
              "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized

      Annotations:
          author: Alexey Volkov <alexey.volkov@ark-kun.com>
inputs:
# Artifact inputs (handed to the program as file paths).
- name: training_data
  type: CSV
- name: starting_model
  type: XGBoostModel
  optional: true
# Training options.
- name: label_column
  type: Integer
  default: '0'
  optional: true
- name: num_iterations
  type: Integer
  default: '10'
  optional: true
- name: booster_params
  type: JsonObject
  optional: true
# Individual booster parameters; used as defaults for keys missing from booster_params.
- name: objective
  type: String
  default: 'reg:squarederror'
  optional: true
- name: booster
  type: String
  default: gbtree
  optional: true
- name: learning_rate
  type: Float
  default: '0.3'
  optional: true
- name: min_split_loss
  type: Float
  default: '0'
  optional: true
- name: max_depth
  type: Integer
  default: '6'
  optional: true
outputs:
# Trained booster written by model.save_model().
- name: model
  type: XGBoostModel
# Booster configuration string written from model.save_config().
- name: model_config
  type: XGBoostModelConfig
implementation:
  container:
    # Base image; xgboost and pandas are not preinstalled and are installed at
    # start-up by the bootstrap command below.
    image: python:3.7
    command:
    # Bootstrap step: `sh -c <script> <$0> <$@...>`.
    # The script installs the pinned packages, retrying with `--user` if the
    # first attempt fails, and then executes "$0" "$@" -- i.e. the command
    # items that follow it (python3 -u -c <program>) plus any further arguments.
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3
      -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5'
      --user) && "$0" "$@"
    # "$0" of the script above: the Python interpreter, unbuffered (-u), running
    # the inline program passed via -c.
    - python3
    - -u
    - -c
    # Inline Python program: helper, training function, and argparse entry point.
    - |
      def _make_parent_dirs_and_return_path(file_path: str):
          import os
          os.makedirs(os.path.dirname(file_path), exist_ok=True)
          return file_path

      def xgboost_train(
          training_data_path,  # CSV file, loaded with pandas.read_csv
          model_path,
          model_config_path,
          starting_model_path = None,

          label_column = 0,
          num_iterations = 10,
          booster_params = None,

          # Booster parameters
          objective = 'reg:squarederror',
          booster = 'gbtree',
          learning_rate = 0.3,
          min_split_loss = 0,
          max_depth = 6,
      ):
          '''Train an XGBoost model.

          Args:
              training_data_path: Path for the training data in CSV format.
              model_path: Output path for the trained model in binary XGBoost format.
              model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.
              starting_model_path: Path for the existing trained model to start from.
              label_column: Positional index of the column containing the label data.
              num_iterations: Number of boosting iterations.
              booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html
                  Keys present here take precedence over the individual arguments below.
                  The passed dictionary is not modified.
              objective: The learning task and the corresponding learning objective.
                  See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
                  The most common values are:
                  "reg:squarederror" - Regression with squared loss (default).
                  "reg:logistic" - Logistic regression.
                  "binary:logistic" - Logistic regression for binary classification, output probability.
                  "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation
                  "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
                  "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized
              booster: Value for the "booster" parameter when booster_params does not set it.
              learning_rate: Value for the "learning_rate" parameter when booster_params does not set it.
              min_split_loss: Value for the "min_split_loss" parameter when booster_params does not set it.
              max_depth: Value for the "max_depth" parameter when booster_params does not set it.

          Annotations:
              author: Alexey Volkov <alexey.volkov@ark-kun.com>
          '''
          import pandas
          import xgboost

          df = pandas.read_csv(
              training_data_path,
          )

          # The label is selected by position; every other column is a feature.
          label_name = df.columns[label_column]
          training_data = xgboost.DMatrix(
              data=df.drop(columns=[label_name]),
              label=df[label_name],
          )

          # Copy first so the defaults filled in below never leak into the caller's dict.
          booster_params = dict(booster_params or {})
          booster_params.setdefault('objective', objective)
          booster_params.setdefault('booster', booster)
          booster_params.setdefault('learning_rate', learning_rate)
          booster_params.setdefault('min_split_loss', min_split_loss)
          booster_params.setdefault('max_depth', max_depth)

          # Optionally continue training from a previously saved model.
          starting_model = None
          if starting_model_path:
              starting_model = xgboost.Booster(model_file=starting_model_path)

          model = xgboost.train(
              params=booster_params,
              dtrain=training_data,
              num_boost_round=num_iterations,
              xgb_model=starting_model
          )

          # Saving the model in binary format
          model.save_model(model_path)

          model_config_str = model.save_config()
          with open(model_config_path, 'w') as model_config_file:
              model_config_file.write(model_config_str)

      import json
      import argparse
      # Command-line entry point: every flag maps onto a keyword argument of xgboost_train.
      _parser = argparse.ArgumentParser(prog='Xgboost train', description='Train an XGBoost model.\n\n    Args:\n        training_data_path: Path for the training data in CSV format.\n        model_path: Output path for the trained model in binary XGBoost format.\n        model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.\n        starting_model_path: Path for the existing trained model to start from.\n        label_column: Column containing the label data.\n        num_iterations: Number of boosting iterations.\n        booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html\n        objective: The learning task and the corresponding learning objective.\n            See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n            The most common values are:\n            "reg:squarederror" - Regression with squared loss (default).\n            "reg:logistic" - Logistic regression.\n            "binary:logistic" - Logistic regression for binary classification, output probability.\n            "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation\n            "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized\n            "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized\n\n    Annotations:\n        author: Alexey Volkov <alexey.volkov@ark-kun.com>')
      # default=argparse.SUPPRESS keeps omitted flags out of the parsed namespace,
      # so xgboost_train falls back to its own keyword defaults for them.
      _parser.add_argument("--training-data", dest="training_data_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--starting-model", dest="starting_model_path", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--label-column", dest="label_column", type=int, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--num-iterations", dest="num_iterations", type=int, required=False, default=argparse.SUPPRESS)
      # The booster parameters arrive as a JSON string and are decoded into a dict.
      _parser.add_argument("--booster-params", dest="booster_params", type=json.loads, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--objective", dest="objective", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--booster", dest="booster", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--learning-rate", dest="learning_rate", type=float, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--min-split-loss", dest="min_split_loss", type=float, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--max-depth", dest="max_depth", type=int, required=False, default=argparse.SUPPRESS)
      # Output paths: the converter creates the parent directory while parsing.
      _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--model-config", dest="model_config_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
      _parsed_args = vars(_parser.parse_args())

      _outputs = xgboost_train(**_parsed_args)
    args:
    # Required input: local path of the training data file.
    - --training-data
    - inputPath: training_data
    # Optional inputs: each flag/value pair is emitted only when the input is supplied.
    - if:
        cond:
          isPresent: starting_model
        then:
        - --starting-model
        - inputPath: starting_model
    - if:
        cond:
          isPresent: label_column
        then:
        - --label-column
        - inputValue: label_column
    - if:
        cond:
          isPresent: num_iterations
        then:
        - --num-iterations
        - inputValue: num_iterations
    - if:
        cond:
          isPresent: booster_params
        then:
        - --booster-params
        - inputValue: booster_params
    - if:
        cond:
          isPresent: objective
        then:
        - --objective
        - inputValue: objective
    - if:
        cond:
          isPresent: booster
        then:
        - --booster
        - inputValue: booster
    - if:
        cond:
          isPresent: learning_rate
        then:
        - --learning-rate
        - inputValue: learning_rate
    - if:
        cond:
          isPresent: min_split_loss
        then:
        - --min-split-loss
        - inputValue: min_split_loss
    - if:
        cond:
          isPresent: max_depth
        then:
        - --max-depth
        - inputValue: max_depth
    # Output locations for the trained model and its configuration.
    - --model
    - outputPath: model
    - --model-config
    - outputPath: model_config