{ "name": "Kubernetes", "description": "Folder for Kubernetes Monitors", "type": "MonitorsLibraryFolderExport", "children": [ { "name": "Kubernetes - Kube Node Not Ready", "description": "This alert is fired when a node is not ready.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_node_status_condition job=kube-state-metrics condition=ready status=true | sum by cluster,node" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-5m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-5m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "MissingData", "timeRange": "-5m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedMissingData", "timeRange": "-5m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - DaemonSet Not Scheduled", "description": "This alert is fired when DaemonSet pods are not scheduled.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_daemonset_status_desired_number_scheduled job=kube-state-metrics" }, { "rowId": "B", "query": "$$kubernetes_data_source metric=kube_daemonset_status_current_number_scheduled job=kube-state-metrics" }, { "rowId": "C", "query": "#A - #B along daemonset, cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Warning", "timeRange": "-5m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedWarning", "timeRange": "-5m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - etcd Insufficient Members", "description": "This alert is fired when we determine that etcd cluster has insufficient members.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=up job=*etcd* | sum by job" }, { "rowId": "B", "query": "$$kubernetes_data_source metric=up job=*etcd* | count by job | eval _value + 1 | eval _value /2" }, { "rowId": "C", "query": "#B - #A along cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-5m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-5m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Pod Crash Looping", "description": "This alert is fired when we determine that a pod is crash looping.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_pod_container_status_restarts_total job=kube-state-metrics | quantize 5m | rate | eval _value * 60 * 5" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Warning", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedWarning", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Multiple Terminated Pods (Errored Out)", "description": "This alert is fired when we determine that there are pods that have been terminated because of errors.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_pod_container_status_terminated_reason reason=error | sum by cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThan", "occurrenceType": "AtLeastOnce", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Multiple Terminated Pods (OOM Killed)", "description": "This alert is fired when we determine that there are pods that have been OOM Killed.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_pod_container_status_terminated_reason reason=oomkilled | sum by cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThan", "occurrenceType": "AtLeastOnce", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Container Waiting", "description": "This alert is fired when a pod container waiting longer than 1 hour.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_pod_container_status_waiting_reason job=kube-state-metrics | sum by cluster, namespace, container, pod" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Warning", "timeRange": "-5m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedWarning", "timeRange": "-5m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Kubelet Down", "description": "This alert is fired when Kubelet disappears from Prometheus target discovery.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=up job=kubelet metrics_path=/metrics | sum by cluster,instance,service" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "MissingData", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedMissingData", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Kube API Down", "description": "This alert is fired when KubeAPI disappears from Prometheus target discovery.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=up job=apiserver | sum by cluster,instance,service" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "MissingData", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedMissingData", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Node CPU Utilization High", "description": "This alert is fired when Node CPU utlization is high.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=node_cpu_seconds_total mode=idle | rate | avg by cluster,node | eval 1 - _value" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-5m", "threshold": 0.9, "thresholdType": "GreaterThan", "occurrenceType": "AtLeastOnce", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-5m", "threshold": 0.9, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "Warning", "timeRange": "-5m", "threshold": 0.75, "thresholdType": "GreaterThan", "occurrenceType": "AtLeastOnce", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedWarning", "timeRange": "-5m", "threshold": 0.75, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Prometheus Remote Storage Failures", "description": "This alert is fired when Prometheus fails to send samples to remote storage.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=prometheus_remote_storage_failed_samples_total (job=collection-kube-prometheus-prometheus or job=collection-prometheus-oper-prometheus) | quantize 5m | rate" }, { "rowId": "B", "query": "$$kubernetes_data_source metric=prometheus_remote_storage_succeeded_samples_total (job=collection-kube-prometheus-prometheus or job=collection-prometheus-oper-prometheus) | quantize 5m | rate" }, { "rowId": "C", "query": "100*(#A/(#A+#B)) along namespace,pod" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-5m", "threshold": 1, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-5m", "threshold": 1, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Kube Controller Manager Down", "description": "This alert is fired when KubeControllerManager disappears from Prometheus target discovery.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=up job=kube-controller-manager | sum by cluster,instance,service" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "MissingData", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedMissingData", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - DaemonSet Misscheduled", "description": "This alert is fired when DaemonSet pods are miss-scheduled.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_daemonset_status_number_misscheduled job=kube-state-metrics | sum by cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Warning", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedWarning", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Kube Scheduler Down", "description": "This alert is fired when Kube Scheduler disappears from Prometheus target discovery.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=up job=scheduler | sum by cluster,instance,service" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "MissingData", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedMissingData", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "MissingData", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - StatefulSet Generation Mismatch", "description": "This alert is fired when StatefulSet generation mismatch is determined due to possible roll-back.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_statefulset_status_observed_generation job=kube-state-metrics" }, { "rowId": "B", "query": "$$kubernetes_data_source metric=kube_statefulset_metadata_generation job=kube-state-metrics" }, { "rowId": "C", "query": "#B - #A along statefulset, cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Warning", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedWarning", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Multiple Terminated Pods (Deadline Exceeded)", "description": "This alert is fired when we determine that there are pods that have been terminated.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_pod_container_status_terminated_reason reason=deadlineexceeded | sum by cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThan", "occurrenceType": "AtLeastOnce", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Multiple Terminated Pods (Container Cannot Run)", "description": "This alert is fired when we determine that there are pods that have been terminated as the container cannot run.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_pod_container_status_terminated_reason reason=containercannotrun | sum by cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Critical", "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThan", "occurrenceType": "AtLeastOnce", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - Multiple Containers OOM Killed", "description": "This alert is fired when multiple containers are OOM Killed.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_pod_container_status_restarts_total | quantize to 12m | rate increasing | max by namespace, container, pod" }, { "rowId": "B", "query": "$$kubernetes_data_source metric=kube_pod_container_status_last_terminated_reason reason=\"OOMKilled\" | max by namespace, container, pod | filter max =1" }, { "rowId": "C", "query": "#A + #B " } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Warning", "timeRange": "-15m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedWarning", "timeRange": "-15m", "threshold": 5, "thresholdType": "LessThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true }, { "name": "Kubernetes - HPA Maxed Out", "description": "This alert is fired when HPA is running at maximum replicas.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "queries": [ { "rowId": "A", "query": "$$kubernetes_data_source metric=kube_hpa_status_current_replicas job=kube-state-metrics" }, { "rowId": "B", "query": "$$kubernetes_data_source metric=kube_hpa_spec_max_replicas job=kube-state-metrics" }, { "rowId": "C", "query": "#B - #A along cluster" } ], "triggers": [ { "detectionMethod": "StaticCondition", "triggerType": "Warning", "timeRange": "-15m", "threshold": 0, "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedWarning", "timeRange": "-15m", "threshold": 0, "thresholdType": "GreaterThan", "occurrenceType": "Always", "triggerSource": "AnyTimeSeries" } ], "notifications": [], "isDisabled": true, "groupNotifications": true } ] }