#!/usr/bin/env bash

set -o nounset
set -x

K=kubectl
if ! $K version > /dev/null; then
    K=oc

    if ! $K version > /dev/null; then
        echo "FATAL: neither 'kubectl' nor 'oc' appear to be working properly. Exiting ..."
        exit 1
    fi
fi

if [[ "$0" == "/usr/bin/gather" ]]; then
    echo "Running as must-gather plugin image"
    export ARTIFACT_DIR=/must-gather
else
    if [ -z "${ARTIFACT_DIR:-}" ]; then
        export ARTIFACT_DIR="/tmp/nvidia-gpu-operator_$(date +%Y%m%d_%H%M)"
    fi
    echo "Using ARTIFACT_DIR=${ARTIFACT_DIR}"
fi

mkdir -p "${ARTIFACT_DIR}"

echo

exec 1> >(tee "${ARTIFACT_DIR}/must-gather.log")
exec 2> "${ARTIFACT_DIR}/must-gather.stderr.log"

if [[ "$0" == "/usr/bin/gather" ]]; then
    echo "NVIDIA GPU Operator" > "${ARTIFACT_DIR}/version"
    echo "${VERSION:-N/A}" >> "${ARTIFACT_DIR}/version"
fi

ocp_cluster=$($K get clusterversion/version --ignore-not-found -oname || true)

if [[ "$ocp_cluster" ]]; then
    echo "Running in OpenShift."
    echo "Get the cluster version"
    $K get clusterversion/version -oyaml > "${ARTIFACT_DIR}/openshift_version.yaml"
fi

echo
echo "#"
echo "# KubeVirt HyperConverged Resources"
echo "#"
echo

HYPERCONVERGED_RESOURCE=$($K get hyperconvergeds.hco.kubevirt.io -A -oname --ignore-not-found)

if [[ "$HYPERCONVERGED_RESOURCE" ]]; then
    echo "Get HyperConverged YAML"
    $K get hyperconvergeds.hco.kubevirt.io -A -oyaml > "${ARTIFACT_DIR}/hyperconverged.yaml"
else
    echo "HyperConverged resource(s) not found in the cluster."
fi

echo "Get the operator namespaces"

OPERATOR_POD_NAME=$($K get pods -lapp=gpu-operator -oname -A)

if [ -z "$OPERATOR_POD_NAME" ]; then
    echo "FATAL: could not find the GPU Operator Pod ..."
    exit 1
fi

OPERATOR_NAMESPACE=$($K get pods -lapp=gpu-operator -A -ojsonpath='{.items[].metadata.namespace}' --ignore-not-found)

echo "Using '$OPERATOR_NAMESPACE' as operator namespace"
echo ""

echo
echo "#"
echo "# KubeVirt Resources"
echo "#"
echo

KUBEVIRT_RESOURCE=$($K get kubevirts.kubevirt.io -A -oname --ignore-not-found)

if [[ "$KUBEVIRT_RESOURCE" ]]; then
    echo "Get KubeVirt YAML"
    $K get kubevirts.kubevirt.io -A -oyaml > "${ARTIFACT_DIR}/kubevirt.yaml"
else
    echo "KubeVirt resource(s) not found in the cluster."
fi echo "#" echo "# ClusterPolicy" echo "#" echo CLUSTER_POLICY_NAME=$($K get clusterpolicies.nvidia.com -oname) if [[ "${CLUSTER_POLICY_NAME}" ]]; then echo "Get ${CLUSTER_POLICY_NAME}" $K get -oyaml "${CLUSTER_POLICY_NAME}" > "${ARTIFACT_DIR}/cluster_policy.yaml" else echo "Mark the ClusterPolicy as missing" touch "${ARTIFACT_DIR}/cluster_policy.missing" fi echo echo "#" echo "# Nodes and machines" echo "#" echo if [ "$ocp_cluster" ]; then echo "Get all the machines" $K get machines -A > "${ARTIFACT_DIR}/all_machines.list" fi echo "Get the labels of the nodes with NVIDIA PCI cards" GPU_PCI_LABELS=(feature.node.kubernetes.io/pci-10de.present feature.node.kubernetes.io/pci-0302_10de.present feature.node.kubernetes.io/pci-0300_10de.present) gpu_pci_nodes="" for label in "${GPU_PCI_LABELS[@]}"; do gpu_pci_nodes="$gpu_pci_nodes $($K get nodes -l$label -oname)" done if [ -z "$gpu_pci_nodes" ]; then echo "FATAL: could not find nodes with NVIDIA PCI labels" exit 0 fi for node in $(echo "$gpu_pci_nodes"); do echo "${node}" | cut -d/ -f2 >> "${ARTIFACT_DIR}/gpu_nodes.labels" $K get "${node}" '-ojsonpath={.metadata.labels}' \ | sed 's|,|,- |g' \ | tr ',' '\n' \ | sed 's/{"/- /' \ | tr : = \ | sed 's/"//g' \ | sed 's/}/\n/' \ >> "${ARTIFACT_DIR}/gpu_nodes.labels" echo "" >> "${ARTIFACT_DIR}/gpu_nodes.labels" done echo "Get the GPU nodes (status)" $K get nodes -l nvidia.com/gpu.present=true -o wide > "${ARTIFACT_DIR}/gpu_nodes.status" echo "Get the GPU nodes (description)" $K describe nodes -l nvidia.com/gpu.present=true > "${ARTIFACT_DIR}/gpu_nodes.descr" echo "" echo "#" echo "# Operator Pod" echo "#" echo echo "Get the GPU Operator Pod (status)" $K get "${OPERATOR_POD_NAME}" \ -owide \ -n "${OPERATOR_NAMESPACE}" \ > "${ARTIFACT_DIR}/gpu_operator_pod.status" echo "Get the GPU Operator Pod (yaml)" $K get "${OPERATOR_POD_NAME}" \ -oyaml \ -n "${OPERATOR_NAMESPACE}" \ > "${ARTIFACT_DIR}/gpu_operator_pod.yaml" echo "Get the GPU Operator Pod logs" $K logs "${OPERATOR_POD_NAME}" \ -n "${OPERATOR_NAMESPACE}" \ > "${ARTIFACT_DIR}/gpu_operator_pod.log" $K logs "${OPERATOR_POD_NAME}" \ -n "${OPERATOR_NAMESPACE}" \ --previous \ > "${ARTIFACT_DIR}/gpu_operator_pod.previous.log" echo "" echo "#" echo "# Operand Pods" echo "#" echo "" echo "Get the Pods in ${OPERATOR_NAMESPACE} (status)" $K get pods -owide \ -n "${OPERATOR_NAMESPACE}" \ > "${ARTIFACT_DIR}/gpu_operand_pods.status" echo "Get the Pods in ${OPERATOR_NAMESPACE} (yaml)" $K get pods -oyaml \ -n "${OPERATOR_NAMESPACE}" \ > "${ARTIFACT_DIR}/gpu_operand_pods.yaml" echo "Get the GPU Operator Pods Images" $K get pods -n "${OPERATOR_NAMESPACE}" \ -o=jsonpath='{range .items[*]}{"\n"}{.metadata.name}{":\t"}{range .spec.containers[*]}{.image}{" "}{end}{end}' \ > "${ARTIFACT_DIR}/gpu_operand_pod_images.txt" echo "Get the description and logs of the GPU Operator Pods" for pod in $($K get pods -n "${OPERATOR_NAMESPACE}" -oname); do if ! $K get "${pod}" -n "${OPERATOR_NAMESPACE}" -ojsonpath='{.metadata.labels}' | grep -E --quiet '(nvidia|gpu)'; then echo "Skipping $pod, not a NVIDA/GPU Pod ..." continue fi pod_name=$(echo "$pod" | cut -d/ -f2) if [ "${pod}" == "${OPERATOR_POD_NAME}" ]; then echo "Skipping operator pod $pod_name ..." 
        continue
    fi

    $K logs "${pod}" \
        -n "${OPERATOR_NAMESPACE}" \
        --all-containers --prefix \
        > "${ARTIFACT_DIR}/gpu_operand_pod_$pod_name.log"

    $K logs "${pod}" \
        -n "${OPERATOR_NAMESPACE}" \
        --all-containers --prefix \
        --previous \
        > "${ARTIFACT_DIR}/gpu_operand_pod_$pod_name.previous.log"

    $K describe "${pod}" \
        -n "${OPERATOR_NAMESPACE}" \
        > "${ARTIFACT_DIR}/gpu_operand_pod_$pod_name.descr"
done

echo ""
echo "#"
echo "# Operand DaemonSets"
echo "#"
echo ""

echo "Get the DaemonSets in $OPERATOR_NAMESPACE (status)"
$K get ds \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operand_ds.status"

echo "Get the DaemonSets in $OPERATOR_NAMESPACE (yaml)"
$K get ds -oyaml \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operand_ds.yaml"

echo "Get the description of the GPU Operator DaemonSets"

for ds in $($K get ds -n "${OPERATOR_NAMESPACE}" -oname); do
    if ! $K get "${ds}" -n "${OPERATOR_NAMESPACE}" -ojsonpath='{.metadata.labels}' | grep -E --quiet '(nvidia|gpu)'; then
        echo "Skipping ${ds}, not an NVIDIA/GPU DaemonSet ..."
        continue
    fi

    $K describe "${ds}" \
        -n "${OPERATOR_NAMESPACE}" \
        > "${ARTIFACT_DIR}/gpu_operand_ds_$(echo "$ds" | cut -d/ -f2).descr"
done

echo ""
echo "#"
echo "# nvidia-bug-report.sh"
echo "#"
echo ""

for pod in $($K get pods -lopenshift.driver-toolkit -oname -n "${OPERATOR_NAMESPACE}"; \
             $K get pods -lapp=nvidia-driver-daemonset -oname -n "${OPERATOR_NAMESPACE}"; \
             $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n "${OPERATOR_NAMESPACE}"); do
    pod_nodename=$($K get "${pod}" -ojsonpath={.spec.nodeName} -n "${OPERATOR_NAMESPACE}")
    echo "Saving nvidia-bug-report from ${pod_nodename} ..."

    # Use group commands ({ ...; }) rather than subshells here: a 'continue'
    # inside a subshell only exits the subshell and would not skip to the next pod.
    $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2 || \
        { echo "Failed to collect nvidia-bug-report from ${pod_nodename}"; continue; }

    $K cp "${OPERATOR_NAMESPACE}"/$(basename "${pod}"):/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz || \
        { echo "Failed to save nvidia-bug-report from ${pod_nodename}"; continue; }

    mv /tmp/nvidia-bug-report.log.gz "${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log.gz"
done

echo ""
echo "#"
echo "# All done!"
if [[ "$0" != "/usr/bin/gather" ]]; then
    echo "# Logs saved into ${ARTIFACT_DIR}."
fi
echo "#"