#!/bin/bash
#
# Slurm epilog hook: when a job finishes with a non-zero exit code, record the
# nodes allocated to it and launch the node-recovery scripts.
#
# Environment:
#   SLURM_JOB_ID         id of the job that just ended (required for failed jobs)
#   SLURM_JOB_EXIT_CODE  exit code of the job; treated as 1 (failure) if unset
# Outputs:
#   /shared/slurm/failed_jobs/failed_job_<id>.nodes  allocated node list
#   /shared/slurm/logs/epilog_job_errors.log         appended diagnostics (stderr)
#
# NOTE(review): with `set -e`, any failing step makes this script exit
# non-zero; confirm against the Slurm epilog documentation how the scheduler
# reacts to a failing epilog on this cluster.
set -euo pipefail

readonly LOG_FILE=/shared/slurm/logs/epilog_job_errors.log
readonly FAILED_JOBS_DIR=/shared/slurm/failed_jobs

# Everything written to stderr from here on is appended to the shared log.
exec 2>> "$LOG_FILE"

# A missing exit code is treated as a failure so that nothing is silently skipped.
job_exit_code=${SLURM_JOB_EXIT_CODE:-1}

# Only process failed jobs
if [[ "$job_exit_code" -ne 0 ]]; then
  job_id=${SLURM_JOB_ID:?SLURM_JOB_ID must be set}
  # Diagnostics go to stderr so they end up in the log file.
  echo "[FAIL] Job ${job_id} failed with code ${job_exit_code}" >&2

  # Extract the allocated NodeList. `\b` keeps ReqNodeList=/ExcNodeList= from
  # matching, and "(null)" placeholders are dropped.
  node_file="${FAILED_JOBS_DIR}/failed_job_${job_id}.nodes"
  mkdir -p -- "$FAILED_JOBS_DIR"
  job_info=$(scontrol show job "$job_id")
  # grep exits 1 when nothing matches; that is not an error here, but a real
  # grep failure (status > 1) must still abort under `set -e`.
  node_list=$(grep -oP '\bNodeList=\K\S+' <<<"$job_info" | grep -vxF '(null)') \
    || [[ $? -eq 1 ]]

  if [[ -n "$node_list" ]]; then
    printf '%s\n' "$node_list" > "$node_file"
  else
    : > "$node_file"
    echo "[WARN] Job ${job_id}: no allocated NodeList found" >&2
  fi

  # Trigger recovery orchestration.
  # Example 1: script that needs no job ID and checks the entire cluster.
  /usr/bin/python3 /shared/slurm/scripts/slurm_node_recovery_orchestrator.py
  # Example 2: script that checks only the nodes of the given job ID.
  /usr/bin/python3 /shared/slurm/scripts/slurm_node_recovery.py --job-id "$job_id"
fi