commit 4e7e9cd92654cfdb425f6a658f91c3cec8423e8c
Author: nkey
Date:   Thu Jan 22 11:06:46 2026 +0900

    Worker-Monitor-Node Stack & README

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ad23566
--- /dev/null
+++ b/README.md
@@ -0,0 +1,113 @@
+# rpi-worker-monitor-node
+Written by: AI / Revised by: nkey
+
+This repo runs Prometheus/Alertmanager/Loki/Grafana with **Docker Compose** on the Raspberry Pi **monitoring worker node**.
+
+- `monitoring/docker-compose.yml`: runs the monitoring-stack containers
+- `monitoring/prometheus/*`: Prometheus configuration and alert rules
+- `monitoring/alertmanager/*`: Alertmanager routing and webhook
+- `monitoring/loki/*`: Loki storage and indexing configuration
+
+Deployment has since moved from Docker Compose to k3s, so the whole stack is now deployed in one step from the master node (see `rpi-master-node/k3s-monitoring`).
+The notes below describe the old Docker Compose setup and are kept for reference only.
+
+
+## Quickstart
+### Recommended
+```bash
+# clone
+git clone https://nkeystudy.site/gitea/2025-capstone/rpi-worker-monitor-node.git
+cd rpi-worker-monitor-node/monitoring
+
+# run
+docker compose up -d
+
+# verify
+docker compose ps
+curl -s http://localhost:9090/-/ready
+curl -s http://localhost:9093/-/ready
+curl -s http://localhost:3100/ready
+```
+
+## Requirements
+- Runtime/Language: Docker Engine + Docker Compose
+- Dependencies: none (the whole stack runs in containers)
+- Tools: `docker`, `docker compose`
+
+## Configuration
+### Environment Variables
+| Key | Description | Default | Required |
+|---|---|---:|:---:|
+| TZ | Container time zone | `Asia/Seoul` | No |
+| GF_SECURITY_ADMIN_USER | Grafana admin user | `admin` | No |
+| GF_SECURITY_ADMIN_PASSWORD | Grafana admin password | `admin1234` | No |
+
+### Ports
+| Service | Port | Description |
+|---|---:|---|
+| Prometheus | 9090 | Prometheus UI/API |
+| Alertmanager | 9093 | Alertmanager UI/API |
+| Loki | 3100 | Loki API |
+| Grafana | 3000 | Grafana UI |
+| Node-Exporter | 9100 | Metrics endpoint scraped by Prometheus (runs on each node, outside this compose file) |
+| Promtail | 9080 | Promtail HTTP endpoint (runs outside this compose file) |
+
+## Usage (minimal)
+- Start / stop
+```bash
+cd monitoring
+docker compose up -d
+docker compose down
+```
+
+- Reload Prometheus after changing its config
+```bash
+# --web.enable-lifecycle is enabled in docker-compose.yml
+curl -X POST http://localhost:9090/-/reload
+```
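+
+- Validate configs before a reload (optional; a minimal sketch: `promtool` and `amtool` ship inside the official `prom/prometheus` and `prom/alertmanager` images, and the container paths below assume the mounts from `docker-compose.yml`)
+```bash
+# syntax-check the Prometheus config and the referenced rule file
+docker compose exec prometheus promtool check config /etc/prometheus/prometheus.yml
+docker compose exec prometheus promtool check rules /etc/prometheus/alert.rules.yml
+
+# syntax-check the Alertmanager routing config
+docker compose exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml
+```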
+## Config file guide (only files included in this repo)
+### Docker Compose
+- `monitoring/docker-compose.yml`
+  - Runs the four services `prometheus`, `alertmanager`, `loki`, `grafana` on the same network
+  - Prometheus:
+    - Mounts `./prometheus/prometheus.yml` and `./prometheus/alert.rules.yml` into the container
+    - Stores data in the `prometheus-data` volume
+    - `--web.enable-lifecycle` enables `/-/reload`
+  - Alertmanager:
+    - Mounts `./alertmanager/alertmanager.yml`
+    - Stores data in the `alertmanager-data` volume
+  - Loki:
+    - Mounts `./loki/loki-config.yml`
+    - Stores data in the `loki-data` volume
+  - Grafana:
+    - Stores data in the `grafana-data` volume
+    - Admin user/password injected via environment variables
+
+### Prometheus
+- `monitoring/prometheus/prometheus.yml`
+  - Scrape targets:
+    - `node-exporter` on each node at `:9100` (`rpi-master:9100`, `rpi-worker-monitor:9100`, `rpi-worker-service:9100`)
+    - `kube-state-metrics` is set to `10.43.217.216:8080` (adjust to match your cluster)
+  - Alertmanager target: `alertmanager:9093`
+  - Rule file: `/etc/prometheus/alert.rules.yml`
+- `monitoring/prometheus/alert.rules.yml`
+  - Node health/resource alerts:
+    - `NodeDown`, `HighCpuLoad`, `HighMemoryUsage`, `DiskAlmostFull(>0.85)`, `HighSystemLoad`
+  - K8s workload alerts:
+    - `KubeNodeNotReady`, `PodCrashLooping`, `PodNotReady`, `DeploymentReplicasMismatch`
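+
+To see how these rules behave at runtime, the standard Prometheus HTTP API can be queried directly (a minimal sketch; it assumes the `9090` port mapping above and `jq` installed on the host):
+```bash
+# list currently pending/firing alerts
+curl -s http://localhost:9090/api/v1/alerts | jq .
+
+# evaluate a rule expression by hand, e.g. the NodeDown condition
+curl -sG http://localhost:9090/api/v1/query \
+  --data-urlencode 'query=up{job="node-exporter"} == 0' | jq .
+```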
+### Alertmanager
+- `monitoring/alertmanager/alertmanager.yml`
+  - Routes `severity="critical"` alerts to `n8n-webhook`
+  - Webhook URL: `http:///n8n/webhook/alert`
+  - `send_resolved: true` also forwards resolve events
+
+### Loki
+- `monitoring/loki/loki-config.yml`
+  - Single-instance Loki setup (filesystem-backed)
+  - Uses `chunks_directory`, `active_index_directory`, etc. under `/loki/*`
+  - `auth_enabled: false` (internal demo use only)
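+
+For an end-to-end smoke test of the alerting and log paths, a synthetic alert can be posted to Alertmanager and a test line pushed to Loki (a sketch against the standard `/api/v2/alerts` and `/loki/api/v1/push` endpoints; the `SmokeTest` and `smoke` names are made up for illustration):
+```bash
+# fire a fake critical alert (it should match the n8n-webhook route)
+curl -s -X POST http://localhost:9093/api/v2/alerts \
+  -H 'Content-Type: application/json' \
+  -d '[{"labels":{"alertname":"SmokeTest","severity":"critical","instance":"manual"}}]'
+
+# push one log line to Loki (timestamp in nanoseconds), then read it back
+curl -s -X POST http://localhost:3100/loki/api/v1/push \
+  -H 'Content-Type: application/json' \
+  -d "{\"streams\":[{\"stream\":{\"job\":\"smoke\"},\"values\":[[\"$(date +%s)000000000\",\"hello loki\"]]}]}"
+curl -sG http://localhost:3100/loki/api/v1/query_range \
+  --data-urlencode 'query={job="smoke"}'
+```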
+
+## License
+(Not specified in repo)
diff --git a/monitoring/alertmanager/alertmanager.yml b/monitoring/alertmanager/alertmanager.yml
new file mode 100644
index 0000000..f331f19
--- /dev/null
+++ b/monitoring/alertmanager/alertmanager.yml
@@ -0,0 +1,25 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: 'default'
+  group_by: ['alertname', 'instance']
+  group_wait: 30s
+  group_interval: 2m
+  repeat_interval: 4h
+
+  routes:
+    - receiver: 'n8n-webhook'
+      matchers:
+        - 'severity="critical"'
+      group_wait: 10s
+      group_interval: 1m
+      repeat_interval: 2h
+
+receivers:
+  - name: 'default'
+
+  - name: 'n8n-webhook'
+    webhook_configs:
+      - url: 'http:///n8n/webhook/alert'
+        send_resolved: true
diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml
new file mode 100644
index 0000000..e04831a
--- /dev/null
+++ b/monitoring/docker-compose.yml
@@ -0,0 +1,81 @@
+version: "3.8"
+
+services:
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    restart: unless-stopped
+    volumes:
+      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml
+      - prometheus-data:/prometheus
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.path=/prometheus"
+      - "--web.enable-lifecycle"
+    ports:
+      - "9090:9090"
+    networks:
+      - monitoring-net
+    environment:
+      - TZ=Asia/Seoul
+
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: alertmanager
+    restart: unless-stopped
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
+      - alertmanager-data:/alertmanager
+    command:
+      - "--config.file=/etc/alertmanager/alertmanager.yml"
+    ports:
+      - "9093:9093"
+    networks:
+      - monitoring-net
+    environment:
+      - TZ=Asia/Seoul
+
+  loki:
+    image: grafana/loki:2.9.0
+    container_name: loki
+    restart: unless-stopped
+    volumes:
+      - ./loki/loki-config.yml:/etc/loki/config.yml
+      - loki-data:/loki
+    command:
+      - "-config.file=/etc/loki/config.yml"
+    ports:
+      - "3100:3100"
+    networks:
+      - monitoring-net
+    environment:
+      - TZ=Asia/Seoul
+
+  grafana:
+    image: grafana/grafana:latest
+    container_name: grafana
+    restart: unless-stopped
+    depends_on:
+      - prometheus
+      - loki
+    ports:
+      - "3000:3000"
+    networks:
+      - monitoring-net
+    environment:
+      - TZ=Asia/Seoul
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=admin1234
+    volumes:
+      - grafana-data:/var/lib/grafana
+
+volumes:
+  prometheus-data:
+  alertmanager-data:
+  loki-data:
+  grafana-data:
+
+networks:
+  monitoring-net:
+    driver: bridge
diff --git a/monitoring/loki/loki-config.yml b/monitoring/loki/loki-config.yml
new file mode 100644
index 0000000..2096f2b
--- /dev/null
+++ b/monitoring/loki/loki-config.yml
@@ -0,0 +1,59 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  ring:
+    kvstore:
+      store: inmemory
+
+ingester:
+  lifecycler:
+    ring:
+      kvstore:
+        store: inmemory
+      replication_factor: 1
+  chunk_idle_period: 5m
+  chunk_block_size: 262144
+  chunk_target_size: 1048576
+  chunk_retain_period: 1m
+  max_transfer_retries: 0
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: boltdb-shipper
+      object_store: filesystem
+      schema: v11
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  boltdb_shipper:
+    active_index_directory: /loki/index
+    shared_store: filesystem
+
+limits_config:
+  ingestion_rate_mb: 10
+  ingestion_burst_size_mb: 20
+  max_cache_freshness_per_query: 10m
+
+chunk_store_config:
+  max_look_back_period: 0
+
+query_range:
+  align_queries_with_step: true
+  max_retries: 5
+
+frontend:
+  max_outstanding_per_tenant: 1024
+
diff --git a/monitoring/prometheus/alert.rules.yml b/monitoring/prometheus/alert.rules.yml
new file mode 100644
index 0000000..ac0e071
--- /dev/null
+++ b/monitoring/prometheus/alert.rules.yml
@@ -0,0 +1,100 @@
+groups:
+  - name: node-health-rules
+    rules:
+      - alert: NodeDown
+        expr: up{job="node-exporter"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Node {{ $labels.instance }} is down"
+          description: "No response from node-exporter on {{ $labels.instance }} for more than 1 minute."
+
+      - alert: HighCpuLoad
+        expr: avg by (instance) (rate(node_cpu_seconds_total{mode!="idle",job="node-exporter"}[5m])) > 0.7
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High CPU usage on {{ $labels.instance }}"
+          description: "Average CPU usage over 5 minutes is above 70% on {{ $labels.instance }}."
+
+      - alert: HighMemoryUsage
+        expr: 1 - (node_memory_MemAvailable_bytes{job="node-exporter"}
+          / node_memory_MemTotal_bytes{job="node-exporter"}) > 0.8
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High memory usage on {{ $labels.instance }}"
+          description: "Memory usage is above 80% for more than 5 minutes on {{ $labels.instance }}."
+
+      - alert: DiskAlmostFull
+        expr: 1 - (
+            node_filesystem_avail_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
+            / node_filesystem_size_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
+          ) > 0.85
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Disk almost full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
+          description: "Filesystem {{ $labels.mountpoint }} is above 85% usage for more than 1 minute on {{ $labels.instance }}."
+
+      - alert: HighSystemLoad
+        expr: (
+            avg by (instance) (node_load5{job="node-exporter"})
+            /
+            on (instance) count by (instance) (node_cpu_seconds_total{job="node-exporter",mode="idle"})
+          ) > 2
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High load average on {{ $labels.instance }}"
+          description: "5-minute load average per CPU core is above 2 for more than 1 minute on {{ $labels.instance }}."
+
+  - name: kubernetes-workload-rules
+    rules:
+      - alert: KubeNodeNotReady
+        expr: kube_node_status_condition{condition="Ready",status="true",job="kube-state-metrics"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes node {{ $labels.node }} is NotReady"
+          description: "Kubernetes node {{ $labels.node }} has been in NotReady state for more than 5 minutes."
+
+      - alert: PodCrashLooping
+        expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) > 3
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
+          description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} restarted more than 3 times in the last 5 minutes."
+
+      - alert: PodNotReady
+        expr: (
+            kube_pod_status_ready{condition="true",job="kube-state-metrics"} == 0
+          )
+          and on (namespace, pod)
+          (
+            kube_pod_status_phase{phase="Running",job="kube-state-metrics"} == 1
+          )
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not Ready"
+          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is Running but not Ready for more than 10 minutes."
+
+      - alert: DeploymentReplicasMismatch
+        expr: kube_deployment_status_replicas_available{job="kube-state-metrics"}
+          < kube_deployment_spec_replicas{job="kube-state-metrics"}
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has unavailable replicas"
+          description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has fewer available replicas than desired for more than 1 minute."
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100644
index 0000000..1b85e04
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,32 @@
+global:
+  scrape_interval: 30s
+  evaluation_interval: 30s
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['prometheus:9090']
+
+  - job_name: 'node-exporter'
+    static_configs:
+      - targets:
+          - 'rpi-master:9100'
+          - 'rpi-worker-monitor:9100'
+          - 'rpi-worker-service:9100'
+
+  - job_name: 'kube-state-metrics'
+    scrape_interval: 30s
+    static_configs:
+      - targets:
+          - '10.43.217.216:8080'
+        labels:
+          service: 'kube-state-metrics'
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']
+
+rule_files:
+  - "/etc/prometheus/alert.rules.yml"
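+
+# Example only (hypothetical, not used by the stack above): to scrape one more
+# node-exporter instance, an extra static job could be appended under
+# scrape_configs, e.g.:
+#
+#   - job_name: 'node-exporter-extra'
+#     static_configs:
+#       - targets: ['rpi-worker-extra:9100']   # hypothetical hostname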