# rpi-worker-monitor-node/monitoring/prometheus/alert.rules.yml

# Prometheus alerting rule groups.
groups:
  # Host-level alerts; every rule in this group selects job="node-exporter".
  - name: node-health-rules
    rules:
      # Fires when the node-exporter target of an instance reports up == 0
      # continuously for one minute.
      - alert: NodeDown
        expr: 'up{job="node-exporter"} == 0'
        for: 1m
        annotations:
          summary: 'Node {{ $labels.instance }} is down'
          description: >-
            No response from node-exporter on {{ $labels.instance }}
            for more than 1 minute.
        labels:
          severity: critical

      # Fires when an instance's CPU utilisation, averaged over all cores and a
      # 5-minute rate window, stays above 70% for 5 minutes.
      #
      # Utilisation is derived as 1 - (idle fraction). Do NOT average the rates
      # of the mode!="idle" series directly: that average runs over every
      # (cpu, mode) pair, so the busy fraction is divided by the number of
      # non-idle modes and the 0.7 threshold becomes practically unreachable.
      - alert: HighCpuLoad
        expr: >-
          1 - avg by (instance) (
          rate(node_cpu_seconds_total{mode="idle",job="node-exporter"}[5m])
          ) > 0.7
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "Average CPU usage over 5 minutes is above 70% on {{ $labels.instance }}."

      # Fires when the used-memory fraction, computed as
      # 1 - MemAvailable / MemTotal, stays above 0.8 for 5 minutes.
      - alert: HighMemoryUsage
        expr: >-
          1 - (node_memory_MemAvailable_bytes{job="node-exporter"}
          / node_memory_MemTotal_bytes{job="node-exporter"}) > 0.8
        for: 5m
        annotations:
          summary: 'High memory usage on {{ $labels.instance }}'
          description: >-
            Memory usage is above 80% for more than 5 minutes
            on {{ $labels.instance }}.
        labels:
          severity: critical

      # Fires when a filesystem's used fraction (1 - avail / size) stays above
      # 0.85 for 10 minutes. tmpfs and overlay filesystems are excluded by the
      # fstype matcher.
      - alert: DiskAlmostFull
        expr: >-
          1 - (
          node_filesystem_avail_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
          / node_filesystem_size_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
          ) > 0.85
        # Hold time matches the "more than 10 minutes" wording of the
        # description annotation below.
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Disk almost full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
          description: "Filesystem {{ $labels.mountpoint }} is above 85% usage for more than 10 minutes on {{ $labels.instance }}."

      # Fires when the 5-minute load average divided by the number of CPU cores
      # stays above 2 for 10 minutes. The core count is obtained by counting the
      # per-CPU mode="idle" series of node_cpu_seconds_total for each instance.
      - alert: HighSystemLoad
        expr: >-
          (
          avg by (instance) (node_load5{job="node-exporter"})
          /
          on (instance) count by (instance) (node_cpu_seconds_total{job="node-exporter",mode="idle"})
          ) > 2
        # Hold time matches the "more than 10 minutes" wording of the
        # description annotation below.
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          description: "5-minute load average per CPU core is above 2 for more than 10 minutes on {{ $labels.instance }}."

  # Kubernetes object alerts; every rule in this group selects
  # job="kube-state-metrics".
  - name: kubernetes-workload-rules
    rules:
      # Fires when a node's Ready condition series with status="true" has the
      # value 0 for 5 minutes.
      - alert: KubeNodeNotReady
        expr: >-
          kube_node_status_condition{condition="Ready",status="true",job="kube-state-metrics"} == 0
        for: 5m
        annotations:
          summary: 'Kubernetes node {{ $labels.node }} is NotReady'
          description: >-
            Kubernetes node {{ $labels.node }} has been in NotReady state
            for more than 5 minutes.
        labels:
          severity: critical

      # Fires immediately (no hold time) when a container's restart counter
      # increased by more than 3 within the last 5 minutes.
      - alert: PodCrashLooping
        expr: 'increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) > 3'
        for: 0m
        annotations:
          summary: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping'
          description: >-
            Container {{ $labels.container }} in pod
            {{ $labels.namespace }}/{{ $labels.pod }} restarted more than
            3 times in the last 5 minutes.
        labels:
          severity: critical

      # Fires when a pod whose phase is Running has its ready{condition="true"}
      # series at 0 for 10 minutes. The two sides are joined on
      # (namespace, pod).
      - alert: PodNotReady
        expr: >-
          (
          kube_pod_status_ready{condition="true",job="kube-state-metrics"} == 0
          )
          and on (namespace, pod)
          (
          kube_pod_status_phase{phase="Running",job="kube-state-metrics"} == 1
          )
        for: 10m
        annotations:
          summary: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} is not Ready'
          description: >-
            Pod {{ $labels.namespace }}/{{ $labels.pod }} is Running but
            not Ready for more than 10 minutes.
        labels:
          severity: critical

      # Fires when a deployment has fewer available replicas than its spec
      # requests for 5 minutes.
      - alert: DeploymentReplicasMismatch
        expr: >-
          kube_deployment_status_replicas_available{job="kube-state-metrics"}
          < kube_deployment_spec_replicas{job="kube-state-metrics"}
        # Hold time matches the "more than 5 minutes" wording of the
        # description annotation below.
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has unavailable replicas"
          description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has fewer available replicas than desired for more than 5 minutes."