# Pipeline component: batch prediction with a trained XGBoost model.
#
# The container first installs pinned versions of xgboost, pandas and pyarrow
# (retrying with `--user` if the plain install fails) and then runs the inline
# Python program below, which:
#   1. reads the feature table from the `data` input (Apache Parquet),
#   2. drops the optional label column if `label_column_name` is given,
#   3. loads the booster from the `model` input,
#   4. writes the predictions as text to the `predictions` output.
name: Xgboost predict
description: |-
  Make predictions using a trained XGBoost model.

      Args:
          data_path: Path for the feature data in Apache Parquet format.
          model_path: Path for the trained model in binary XGBoost format.
          predictions_path: Output path for the predictions.
          label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.

      Annotations:
          author: Alexey Volkov
inputs:
- {name: data, type: ApacheParquet}
- {name: model, type: XGBoostModel}
- {name: label_column_name, type: String, optional: true}
outputs:
- {name: predictions, type: Text}
implementation:
  container:
    image: python:3.7
    command:
    - sh
    - -c
    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
      'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1
      python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5'
      'pyarrow==0.17.1' --user) && "$0" "$@"
    - python3
    - -u
    - -c
    - |
      def _make_parent_dirs_and_return_path(file_path: str) -> str:
          '''Create the parent directory of file_path (if any) and return file_path.

          Used as an argparse `type=` converter so that output locations exist
          before the program writes to them.
          '''
          import os
          parent_dir = os.path.dirname(file_path)
          # A bare file name has an empty dirname; os.makedirs('') would raise
          # FileNotFoundError, so only create a directory when there is one.
          if parent_dir:
              os.makedirs(parent_dir, exist_ok=True)
          return file_path

      def xgboost_predict(
          data_path,
          model_path,
          predictions_path,
          label_column_name = None,
      ):
          '''Make predictions using a trained XGBoost model.

          The feature table is read with pandas.read_parquet, wrapped in an
          xgboost.DMatrix, scored with xgboost.Booster.predict and the result
          is written to predictions_path with numpy.savetxt.

          Args:
              data_path: Path for the feature data in Apache Parquet format.
              model_path: Path for the trained model in binary XGBoost format.
              predictions_path: Output path for the predictions.
              label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.

          Annotations:
              author: Alexey Volkov
          '''
          from pathlib import Path

          import numpy
          import pandas
          import xgboost

          # Loading data
          df = pandas.read_parquet(data_path)
          if label_column_name:
              # The label must not be fed to the model as a feature.
              df = df.drop(columns=[label_column_name])

          evaluation_data = xgboost.DMatrix(
              data=df,
          )

          # Loading the trained model and predicting
          model = xgboost.Booster(model_file=model_path)

          predictions = model.predict(evaluation_data)

          # Also ensure the output directory exists here, so the function works
          # when called directly rather than through the argparse driver.
          Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
          numpy.savetxt(predictions_path, predictions)

      import argparse
      _parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make predictions using a trained XGBoost model.\n\n Args:\n data_path: Path for the feature data in Apache Parquet format.\n model_path: Path for the trained model in binary XGBoost format.\n predictions_path: Output path for the predictions.\n label_column_name: Optional. Name of the column containing the label data that is excluded during the prediction.\n\n Annotations:\n author: Alexey Volkov ')
      _parser.add_argument("--data", dest="data_path", type=str, required=True, default=argparse.SUPPRESS)
      _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS)
      # default=SUPPRESS keeps the key out of the namespace when the flag is
      # absent, so xgboost_predict falls back to its own default (None).
      _parser.add_argument("--label-column-name", dest="label_column_name", type=str, required=False, default=argparse.SUPPRESS)
      _parser.add_argument("--predictions", dest="predictions_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
      _parsed_args = vars(_parser.parse_args())

      _outputs = xgboost_predict(**_parsed_args)
    args:
    - --data
    - {inputPath: data}
    - --model
    - {inputPath: model}
    - if:
        cond: {isPresent: label_column_name}
        then:
        - --label-column-name
        - {inputValue: label_column_name}
    - --predictions
    - {outputPath: predictions}