# Listing metadata (from the original capture): YAML, 101 lines, 4.2 KiB
# Prometheus alerting rules.
# Every rule below fires only after its `expr` has kept returning results for
# the whole `for` duration. Annotation text uses Go templating over the alert's
# labels.
groups:
# Host-level alerts built on node-exporter metrics.
- name: node-health-rules
  rules:
  # The node-exporter scrape target is reported as down (`up` equals 0).
  - alert: NodeDown
    expr: up{job="node-exporter"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Node {{ $labels.instance }} is down"
      description: "No response from node-exporter on {{ $labels.instance }} for more than 1 minute."

  # CPU utilisation per instance, computed as 1 minus the average idle
  # fraction. Averaging the non-idle per-mode rates directly (the previous
  # form) spreads the busy time over every (cpu, mode) series, so the result
  # is utilisation divided by the number of non-idle modes and can never
  # reach the 0.7 threshold.
  - alert: HighCpuLoad
    expr: |-
      1 - avg by (instance) (
        rate(node_cpu_seconds_total{mode="idle",job="node-exporter"}[5m])
      ) > 0.7
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "Average CPU usage over 5 minutes is above 70% on {{ $labels.instance }}."

  # Used-memory fraction: 1 - MemAvailable / MemTotal.
  - alert: HighMemoryUsage
    expr: |-
      1 - (
        node_memory_MemAvailable_bytes{job="node-exporter"}
        / node_memory_MemTotal_bytes{job="node-exporter"}
      ) > 0.8
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage is above 80% for more than 5 minutes on {{ $labels.instance }}."

  # Used-space fraction per filesystem, skipping tmpfs and overlay mounts.
  # `for` is 10m so the rule matches the duration stated in the description
  # (it was 1m, which contradicted the alert text).
  - alert: DiskAlmostFull
    expr: |-
      1 - (
        node_filesystem_avail_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
        / node_filesystem_size_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
      ) > 0.85
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "Disk almost full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
      description: "Filesystem {{ $labels.mountpoint }} is above 85% usage for more than 10 minutes on {{ $labels.instance }}."

  # 5-minute load average divided by the number of `mode="idle"` CPU series
  # per instance (used here as the per-instance core count).
  # `for` is 10m so the rule matches the duration stated in the description
  # (it was 1m, which contradicted the alert text).
  - alert: HighSystemLoad
    expr: |-
      (
        avg by (instance) (node_load5{job="node-exporter"})
        /
        on (instance) count by (instance) (node_cpu_seconds_total{job="node-exporter",mode="idle"})
      ) > 2
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "High load average on {{ $labels.instance }}"
      description: "5-minute load average per CPU core is above 2 for more than 10 minutes on {{ $labels.instance }}."
# Cluster-level alerts built on kube-state-metrics series.
- name: kubernetes-workload-rules
  rules:
  # The node's Ready condition with status "true" reports 0.
  - alert: KubeNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true",job="kube-state-metrics"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes node {{ $labels.node }} is NotReady"
      description: "Kubernetes node {{ $labels.node }} has been in NotReady state for more than 5 minutes."

  # More than 3 container restarts within the 5-minute window.
  # `for: 0m` means the alert fires on the first evaluation that matches.
  - alert: PodCrashLooping
    expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) > 3
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
      description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} restarted more than 3 times in the last 5 minutes."

  # Pods whose ready condition is 0, restricted (matched on namespace and pod)
  # to pods whose phase is Running.
  - alert: PodNotReady
    expr: |-
      (
        kube_pod_status_ready{condition="true",job="kube-state-metrics"} == 0
      )
      and on (namespace, pod)
      (
        kube_pod_status_phase{phase="Running",job="kube-state-metrics"} == 1
      )
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not Ready"
      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is Running but not Ready for more than 10 minutes."

  # Available replicas are below the desired replica count.
  # `for` is 5m so the rule matches the duration stated in the description
  # (it was 1m, which contradicted the alert text).
  - alert: DeploymentReplicasMismatch
    expr: |-
      kube_deployment_status_replicas_available{job="kube-state-metrics"}
        < kube_deployment_spec_replicas{job="kube-state-metrics"}
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has unavailable replicas"
      description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has fewer available replicas than desired for more than 5 minutes."