Worker-Monitor-Node Stack & README

This commit is contained in:
2026-01-22 11:06:46 +09:00
commit 4e7e9cd926
6 changed files with 410 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
# Alertmanager configuration.
# Default route batches alerts; critical alerts are additionally routed to an
# n8n webhook with tighter grouping/repeat timing.
global:
  resolve_timeout: 5m

route:
  receiver: 'default'
  group_by: ['alertname', 'instance']
  group_wait: 30s
  group_interval: 2m
  repeat_interval: 4h
  routes:
    # Critical alerts fire faster and repeat more often via the n8n webhook.
    - receiver: 'n8n-webhook'
      matchers:
        - 'severity="critical"'
      group_wait: 10s
      group_interval: 1m
      repeat_interval: 2h

receivers:
  # Catch-all receiver with no notification integration (intentional no-op).
  - name: 'default'
  - name: 'n8n-webhook'
    webhook_configs:
      # TODO: replace <IP> with the real n8n host before deploying.
      - url: 'http://<IP>/n8n/webhook/alert'
        send_resolved: true

View File

@@ -0,0 +1,81 @@
# Monitoring stack: Prometheus (metrics + alert rules), Alertmanager,
# Loki (logs), and Grafana (dashboards), all on a shared bridge network.
version: "3.8"

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml
      - prometheus-data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      # Enables POST /-/reload so config can be reloaded without a restart.
      - "--web.enable-lifecycle"
    ports:
      - "9090:9090"
    networks:
      - monitoring-net
    environment:
      - TZ=Asia/Seoul

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager-data:/alertmanager
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
    ports:
      - "9093:9093"
    networks:
      - monitoring-net
    environment:
      - TZ=Asia/Seoul

  loki:
    # Pinned version (unlike the other images) so the config file below
    # stays compatible with the Loki schema it was written for.
    image: grafana/loki:2.9.0
    container_name: loki
    restart: unless-stopped
    volumes:
      - ./loki/loki-config.yml:/etc/loki/config.yml
      - loki-data:/loki
    command:
      - "-config.file=/etc/loki/config.yml"
    ports:
      - "3100:3100"
    networks:
      - monitoring-net
    environment:
      - TZ=Asia/Seoul

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    restart: unless-stopped
    depends_on:
      - prometheus
      - loki
    ports:
      - "3000:3000"
    networks:
      - monitoring-net
    environment:
      - TZ=Asia/Seoul
      - GF_SECURITY_ADMIN_USER=admin
      # NOTE(review): plaintext admin password committed to VCS — move to an
      # .env file or secret store and rotate before exposing this stack.
      - GF_SECURITY_ADMIN_PASSWORD=admin1234
    volumes:
      - grafana-data:/var/lib/grafana

volumes:
  prometheus-data:
  alertmanager-data:
  loki-data:
  grafana-data:

networks:
  monitoring-net:
    driver: bridge

View File

@@ -0,0 +1,59 @@
# Loki 2.9 single-binary configuration: filesystem storage, in-memory ring,
# single replica — suitable for a single-node monitoring host only.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  ring:
    kvstore:
      store: inmemory

ingester:
  lifecycler:
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
  chunk_idle_period: 5m
  chunk_block_size: 262144
  chunk_target_size: 1048576
  chunk_retain_period: 1m
  # Deprecated in Loki 2.9 (removed in 3.x); harmless at 0, drop on upgrade.
  max_transfer_retries: 0

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/index
    # Deprecated in Loki 2.9 in favor of per-store config; still accepted.
    shared_store: filesystem

limits_config:
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20
  max_cache_freshness_per_query: 10m

chunk_store_config:
  # Duration field — must carry a unit; 0s means "no lookback limit".
  max_look_back_period: 0s

query_range:
  align_queries_with_step: true
  max_retries: 5

frontend:
  max_outstanding_per_tenant: 1024

View File

@@ -0,0 +1,100 @@
# Prometheus alerting rules: host-level health (node-exporter) and
# Kubernetes workload health (kube-state-metrics). All alerts are critical
# and route to the n8n webhook via Alertmanager.
groups:
  - name: node-health-rules
    rules:
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "No response from node-exporter on {{ $labels.instance }} for more than 1 minute."

      - alert: HighCpuLoad
        expr: avg by (instance) (rate(node_cpu_seconds_total{mode!="idle",job="node-exporter"}[5m])) > 0.7
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "Average CPU usage over 5 minutes is above 70% on {{ $labels.instance }}."

      - alert: HighMemoryUsage
        expr: |
          1 - (node_memory_MemAvailable_bytes{job="node-exporter"}
          / node_memory_MemTotal_bytes{job="node-exporter"}) > 0.8
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 80% for more than 5 minutes on {{ $labels.instance }}."

      - alert: DiskAlmostFull
        # tmpfs/overlay excluded: container and RAM-backed mounts are noise.
        expr: |
          1 - (
            node_filesystem_avail_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
            / node_filesystem_size_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
          ) > 0.85
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Disk almost full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
          # Fixed: previous text said "10 minutes" but `for:` is 1m.
          description: "Filesystem {{ $labels.mountpoint }} is above 85% usage for more than 1 minute on {{ $labels.instance }}."

      - alert: HighSystemLoad
        # Normalizes load5 by core count (one idle-mode series per CPU).
        expr: |
          (
            avg by (instance) (node_load5{job="node-exporter"})
            /
            on (instance) count by (instance) (node_cpu_seconds_total{job="node-exporter",mode="idle"})
          ) > 2
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          # Fixed: previous text said "10 minutes" but `for:` is 1m.
          description: "5-minute load average per CPU core is above 2 for more than 1 minute on {{ $labels.instance }}."

  - name: kubernetes-workload-rules
    rules:
      - alert: KubeNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true",job="kube-state-metrics"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Kubernetes node {{ $labels.node }} is NotReady"
          description: "Kubernetes node {{ $labels.node }} has been in NotReady state for more than 5 minutes."

      - alert: PodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) > 3
        # Fire immediately — restart bursts are already time-windowed by [5m].
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
          description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} restarted more than 3 times in the last 5 minutes."

      - alert: PodNotReady
        # Only alerts on Running pods, so Pending/Succeeded pods don't match.
        expr: |
          (
            kube_pod_status_ready{condition="true",job="kube-state-metrics"} == 0
          )
          and on (namespace, pod)
          (
            kube_pod_status_phase{phase="Running",job="kube-state-metrics"} == 1
          )
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not Ready"
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is Running but not Ready for more than 10 minutes."

      - alert: DeploymentReplicasMismatch
        expr: |
          kube_deployment_status_replicas_available{job="kube-state-metrics"}
          < kube_deployment_spec_replicas{job="kube-state-metrics"}
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has unavailable replicas"
          # Fixed: previous text said "5 minutes" but `for:` is 1m.
          description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has fewer available replicas than desired for more than 1 minute."

View File

@@ -0,0 +1,32 @@
# Prometheus scrape configuration: self-scrape, node-exporter on three
# Raspberry Pi hosts, and kube-state-metrics; alerts go to Alertmanager.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['prometheus:9090']

  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'rpi-master:9100'
          - 'rpi-worker-monitor:9100'
          - 'rpi-worker-service:9100'

  - job_name: 'kube-state-metrics'
    scrape_interval: 30s
    static_configs:
      - targets:
          # NOTE(review): hard-coded ClusterIP — this changes if the Service
          # is recreated; prefer a stable DNS name or NodePort if reachable.
          - '10.43.217.216:8080'
        labels:
          service: 'kube-state-metrics'

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

rule_files:
  - "/etc/prometheus/alert.rules.yml"