From c5b1ea43a4aff1da27754155b147e8ba233e7ac6 Mon Sep 17 00:00:00 2001
From: nkey
Date: Thu, 22 Jan 2026 11:11:29 +0900
Subject: [PATCH] Master-Node Stack & README

---
 README.md                                  | 289 +++++++++++++++++++++
 k3s-manifests/aam-deploy.yaml              |  66 +++++
 k3s-manifests/disk-fill-demo.yaml          |  48 ++++
 k3s-monitoring/alertmanager-config.yml     |  32 +++
 k3s-monitoring/alertmanager-deploy.yml     |  50 ++++
 k3s-monitoring/grafana-config.yml          |  18 ++
 k3s-monitoring/grafana-deploy.yml          |  51 ++++
 k3s-monitoring/loki-config.yml             |  65 +++++
 k3s-monitoring/loki-deploy.yml             |  48 ++++
 k3s-monitoring/prometheus-config.yml       | 162 ++++++++++++
 k3s-monitoring/prometheus-deploy.yml       |  52 ++++
 kube-state-metrics/kube-state-metrics.yaml |  99 +++++++
 12 files changed, 980 insertions(+)
 create mode 100644 README.md
 create mode 100644 k3s-manifests/aam-deploy.yaml
 create mode 100644 k3s-manifests/disk-fill-demo.yaml
 create mode 100644 k3s-monitoring/alertmanager-config.yml
 create mode 100644 k3s-monitoring/alertmanager-deploy.yml
 create mode 100644 k3s-monitoring/grafana-config.yml
 create mode 100644 k3s-monitoring/grafana-deploy.yml
 create mode 100644 k3s-monitoring/loki-config.yml
 create mode 100644 k3s-monitoring/loki-deploy.yml
 create mode 100644 k3s-monitoring/prometheus-config.yml
 create mode 100644 k3s-monitoring/prometheus-deploy.yml
 create mode 100644 kube-state-metrics/kube-state-metrics.yaml

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a637090
--- /dev/null
+++ b/README.md
@@ -0,0 +1,289 @@
+# rpi-master-node
+Written by: AI / Revised by: nkey
+
+Master-node repo for applying the **manifests (demo / monitoring / AAM)** to a k3s cluster.
+
+- `k3s-manifests/`: demo/service manifests
+- `k3s-monitoring/`: Prometheus/Alertmanager/Loki/Grafana (deployed on K8s)
+- `kube-state-metrics/`: kube-state-metrics deployment
+
+## Quickstart
+### Recommended
+```bash
+# clone
+git clone https://nkeystudy.site/gitea/2025-capstone/rpi-master-node.git
+cd rpi-master-node
+
+# (optional) create the monitoring namespace if it does not exist yet
+kubectl get ns monitoring >/dev/null 2>&1 || kubectl create ns monitoring
+
+# kube-state-metrics
+kubectl apply -f kube-state-metrics/kube-state-metrics.yaml
+
+# monitoring stack (apply ConfigMaps before Deployments)
+kubectl apply -f k3s-monitoring/prometheus-config.yml
+kubectl apply -f k3s-monitoring/alertmanager-config.yml
+kubectl apply -f k3s-monitoring/loki-config.yml
+kubectl apply -f k3s-monitoring/grafana-config.yml
+
+kubectl apply -f k3s-monitoring/prometheus-deploy.yml
+kubectl apply -f k3s-monitoring/alertmanager-deploy.yml
+kubectl apply -f k3s-monitoring/loki-deploy.yml
+kubectl apply -f k3s-monitoring/grafana-deploy.yml
+
+# demo workloads
+kubectl apply -f k3s-manifests/disk-fill-demo.yaml
+kubectl apply -f k3s-manifests/aam-deploy.yaml
+
+# verify
+kubectl get svc -n monitoring
+kubectl get svc -n alert-service
+kubectl get svc -n aam
+```
+
+## Requirements
+- Runtime/Language: Kubernetes (k3s) + `kubectl`
+- Dependencies: (K8s) ConfigMap/Deployment/Service/Secret (plus RBAC)
+- Tools: `kubectl`
+
+## Configuration
+### Environment Variables
+| Key | Description | Default | Required |
+|---|---|---:|:---:|
+| NODE_ENV | AAM runtime environment | `"production"` | |
+| PORT | AAM container port | `"9000"` | |
+| DRY_RUN | AAM behavior flag | `"false"` | |
+| USE_SUDO | AAM behavior flag | `"true"` | |
+| SSH_USER | AAM SSH user | `"pi"` | |
+| SSH_KEY_PATH | AAM SSH key path | `"/app/.ssh/aam_key"` | |
+| GF_SECURITY_ADMIN_USER | Grafana admin account | `"admin"` | |
+| GF_SECURITY_ADMIN_PASSWORD | Grafana admin password | `"admin1234"` | |
+
+> The values above are hard-coded in the manifests.
+> AAM mounts the `aam-ssh-key` Secret at `/app/.ssh`.
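+
+The manifests mount the `aam-ssh-key` Secret but do not create it. A minimal sketch for creating it before applying `aam-deploy.yaml` (the local key location `~/.ssh/aam_key` is an assumption; use wherever your private key actually lives):
+
+```bash
+# aam-deploy.yaml also declares the namespace; creating it up front lets the
+# Secret exist before the Deployment is applied
+kubectl get ns aam >/dev/null 2>&1 || kubectl create ns aam
+
+# the key inside the Secret must be named "aam_key" so the mounted file
+# matches SSH_KEY_PATH=/app/.ssh/aam_key
+kubectl -n aam create secret generic aam-ssh-key \
+  --from-file=aam_key="$HOME/.ssh/aam_key"
+```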
+
+### Ports
+| Service | Port | Description |
+|---|---:|---|
+| prometheus (NodePort) | 30100 → 9090 | Prometheus UI |
+| alertmanager (NodePort) | 30101 → 9093 | Alertmanager UI |
+| loki (NodePort) | 30102 → 3100 | Loki API (promtail push target) |
+| grafana (NodePort) | 30103 → 3000 | Grafana UI |
+| disk-fill-demo (NodePort) | 30001 → 8000 | disk-fill demo service |
+| aam-service (NodePort) | 30104 → 9000 | AAM service |
+| kube-state-metrics (ClusterIP) | 8080 | kube-state-metrics metrics |
+
+## Usage (minimal)
+- Deploy the disk-fill demo
+```bash
+kubectl apply -f k3s-manifests/disk-fill-demo.yaml
+```
+
+- Check external access to the monitoring stack (NodePort)
+```bash
+kubectl get svc -n monitoring
+```
+
+- Deploy AAM and check its Service
+```bash
+kubectl apply -f k3s-manifests/aam-deploy.yaml
+kubectl get svc -n aam
+```
+
+## (All nodes) Install node-exporter (manual)
+> This repo does not include a node-exporter install script or systemd unit.
+> Prometheus scrapes the `job_name: node-exporter` targets (port **9100**) defined in `k3s-monitoring/prometheus-config.yml`.
+
+### 1) Download and install the binary (ARM64)
+```bash
+cd /tmp
+
+# download node_exporter (ARM64 build)
+wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-arm64.tar.gz
+
+# extract
+tar xvf node_exporter-1.8.2.linux-arm64.tar.gz
+
+# move the binary to a standard path
+sudo mv node_exporter-1.8.2.linux-arm64/node_exporter /usr/local/bin/
+
+# clean up the extracted files (optional)
+rm -rf node_exporter-1.8.2.linux-arm64*
+```
+
+### 2) Create a dedicated user
+```bash
+sudo useradd --no-create-home --system node_exporter
+sudo chown node_exporter:node_exporter /usr/local/bin/node_exporter
+```
+
+### 3) Create the systemd service
+```bash
+sudo nano /etc/systemd/system/node_exporter.service
+```
+
+```ini
+[Unit]
+Description=Prometheus Node Exporter
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+User=node_exporter
+Group=node_exporter
+Type=simple
+# recent node_exporter releases (including v1.8.2) dropped the old
+# --collector.filesystem.ignored-* flags in favor of the *-exclude names
+ExecStart=/usr/local/bin/node_exporter \
+  --web.listen-address=":9100" \
+  --collector.filesystem.mount-points-exclude="^/(sys|proc|dev|run)($|/)" \
+  --collector.filesystem.fs-types-exclude="^(sysfs|proc|autofs|devpts|tmpfs|devtmpfs|rpc_pipefs|overlay|squashfs)$"
+
+Restart=on-failure
+
+[Install]
+WantedBy=multi-user.target
+```
+
+### 4) Enable and start the service
+```bash
+sudo systemctl daemon-reload
+sudo systemctl enable --now node_exporter
+
+sudo systemctl status node_exporter
+curl http://localhost:9100/metrics | head
+```
+
+## (All nodes) Install promtail (manual)
+> Loki (running on K8s) is exposed on NodePort **30102** by `k3s-monitoring/loki-deploy.yml`.
+> promtail on each node pushes its logs to that Loki endpoint.
+
+### 1) Download and install (ARM64)
+```bash
+cd /tmp
+PROMTAIL_VERSION="2.9.6"
+
+wget https://github.com/grafana/loki/releases/download/v${PROMTAIL_VERSION}/promtail-linux-arm64.zip
+# on Debian-family systems unzip may need installing first: sudo apt-get install -y unzip
+unzip promtail-linux-arm64.zip
+
+sudo mv promtail-linux-arm64 /usr/local/bin/promtail
+sudo chmod 755 /usr/local/bin/promtail
+```
+
+> promtail needs a dedicated service account (`promtail`); create it according to your operational policy.
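+
+A minimal sketch of that account creation, mirroring the node_exporter user above. The `adm` group addition is an assumption about Debian-style permissions: on Raspberry Pi OS and other Debian derivatives, many files under `/var/log` are readable only by root and the `adm` group, so a plain system user may be able to ship nothing.
+
+```bash
+# dedicated no-login system user for promtail
+sudo useradd --no-create-home --system promtail
+
+# (assumption: Debian-style /var/log permissions) allow reading /var/log/*log
+sudo usermod -aG adm promtail
+```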
+
+### 2) Create the directories
+```bash
+sudo mkdir -p /etc/promtail
+sudo mkdir -p /var/lib/promtail
+
+# (when using the dedicated account) hand over ownership
+sudo chown -R promtail:promtail /var/lib/promtail
+sudo chown -R promtail:promtail /etc/promtail
+```
+
+### 3) Configure promtail
+```bash
+sudo nano /etc/promtail/config.yml
+```
+
+```yaml
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /var/lib/promtail/positions.yaml
+
+clients:
+  # push to Loki (NodePort 30102)
+  - url: http://100.104.27.47:30102/loki/api/v1/push
+
+scrape_configs:
+  - job_name: varlogs
+    static_configs:
+      - targets: [localhost]
+        labels:
+          job: varlogs
+          host: <node name>   # set this per node
+          __path__: /var/log/*log
+```
+
+### 4) Register the systemd service
+```bash
+sudo nano /etc/systemd/system/promtail.service
+```
+
+```ini
+[Unit]
+Description=Promtail service
+After=network.target
+
+[Service]
+User=promtail
+Group=promtail
+Type=simple
+ExecStart=/usr/local/bin/promtail -config.file=/etc/promtail/config.yml
+Restart=on-failure
+
+[Install]
+WantedBy=multi-user.target
+```
+
+### 5) Start and verify
+```bash
+sudo systemctl daemon-reload
+sudo systemctl enable --now promtail
+
+sudo systemctl status promtail
+
+# after editing /etc/promtail/config.yml, pick up the changes with:
+sudo systemctl restart promtail
+```
+
+## Configuration file reference (only files included in this repo)
+### Demo/service manifests (`k3s-manifests/*`)
+- `k3s-manifests/disk-fill-demo.yaml`
+  - creates the `alert-service` namespace
+  - `disk-fill-demo` Deployment + NodePort Service (30001→8000)
+  - `nodeSelector: demo-service="true"` so the pod is **scheduled onto the service worker node**
+- `k3s-manifests/aam-deploy.yaml`
+  - creates the `aam` namespace
+  - `aam-deployment` (image: `sadew1112/aam:0.1.0`) + NodePort Service (30104→9000)
+  - pinned to the monitoring worker node via `nodeName: rpi-worker-monitor`
+  - mounts the `aam-ssh-key` Secret at `/app/.ssh` (read-only, mode `0400`)
+
+### Monitoring stack (`k3s-monitoring/*`)
+- `k3s-monitoring/prometheus-config.yml`
+  - defines the Prometheus scrape targets:
+    - `prometheus:9090`
+    - `node-exporter`: `100.104.27.47:9100`, `100.77.135.86:9100`, `100.90.177.14:9100`
+    - `kube-state-metrics`: `kube-state-metrics.kube-system.svc.cluster.local:8080`
+  - Alertmanager target: `alertmanager:9093`
+  - includes the alerting rules (NodeDown/HighCpu/HighMemory/DiskAlmostFull etc.)
+  - Note: the `DiskAlmostFull` expr is `> 0.4`, so **the alert currently fires above 40% usage**; raise the threshold to `0.85` if you want the conventional 85%.
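+  - To preview which filesystems would currently trip `DiskAlmostFull`, the rule's expression can be pasted into the Prometheus UI (NodePort 30100) unchanged:
+
+    ```promql
+    1 - (
+      node_filesystem_avail_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
+      / node_filesystem_size_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
+    ) > 0.4
+    ```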
+- `k3s-monitoring/prometheus-deploy.yml`
+  - Prometheus Deployment + NodePort Service (30100→9090)
+  - pinned via `nodeName: rpi-worker-monitor`
+  - TSDB storage is an `emptyDir` (test/demo use)
+- `k3s-monitoring/alertmanager-config.yml`
+  - routes `severity="critical"` alerts to the `n8n-webhook` receiver
+  - Webhook URL: `http:///n8n/webhook/alert` (`send_resolved: true`); the host part is empty in the committed file, so fill in your n8n endpoint
+- `k3s-monitoring/alertmanager-deploy.yml`
+  - Alertmanager Deployment + NodePort Service (30101→9093)
+  - pinned via `nodeName: rpi-worker-monitor`
+- `k3s-monitoring/loki-config.yml`
+  - single-instance Loki configuration (filesystem storage: `/loki/chunks`, `/loki/index`, etc.)
+  - `auth_enabled: false` (internal demo use)
+- `k3s-monitoring/loki-deploy.yml`
+  - Loki Deployment + NodePort Service (30102→3100)
+  - as the inline comment says, this is the **NodePort promtail pushes to**
+- `k3s-monitoring/grafana-config.yml`
+  - Grafana datasource provisioning
+  - connects Prometheus and Loki via cluster DNS
+- `k3s-monitoring/grafana-deploy.yml`
+  - Grafana Deployment + NodePort Service (30103→3000)
+  - admin account/password injected via env (`admin` / `admin1234`)
+  - storage is an `emptyDir` (demo use)
+
+### kube-state-metrics (`kube-state-metrics/*`)
+- `kube-state-metrics/kube-state-metrics.yaml`
+  - ServiceAccount + ClusterRole/Binding (RBAC)
+  - Deployment (image: `registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.10.0`, port 8080)
+  - ClusterIP Service (8080)
diff --git a/k3s-manifests/aam-deploy.yaml b/k3s-manifests/aam-deploy.yaml
new file mode 100644
index 0000000..b6ae81b
--- /dev/null
+++ b/k3s-manifests/aam-deploy.yaml
@@ -0,0 +1,66 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: aam
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: aam-deployment
+  namespace: aam
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: aam
+  template:
+    metadata:
+      labels:
+        app: aam
+    spec:
+      nodeName: rpi-worker-monitor
+
+      containers:
+        - name: aam
+          image: sadew1112/aam:0.1.0
+          imagePullPolicy: Always
+          env:
+            - name: NODE_ENV
+              value: "production"
+            - name: PORT
+              value: "9000"
+            - name: DRY_RUN
+              value: "false"
+            - name: USE_SUDO
+              value: "true"
+            - name: SSH_USER
+              value: "pi"
+            - name: SSH_KEY_PATH
+              value: "/app/.ssh/aam_key"
+          ports:
+            - containerPort: 9000
+              name: http
+          volumeMounts:
+            - name: ssh-key-volume
+              mountPath: /app/.ssh
+              readOnly: true
+      volumes:
+        - name: ssh-key-volume
+          secret:
+            secretName: aam-ssh-key
+            defaultMode: 0400
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: aam-service
+  namespace: aam
+spec:
+  selector:
+    app: aam
+  type: NodePort
+  ports:
+    - name: http
+      port: 9000
+      targetPort: 9000
+      nodePort: 30104
diff --git a/k3s-manifests/disk-fill-demo.yaml b/k3s-manifests/disk-fill-demo.yaml
new file mode 100644
index 0000000..bf3d473
--- /dev/null
+++ b/k3s-manifests/disk-fill-demo.yaml
@@ -0,0 +1,48 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: alert-service
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: disk-fill-demo
+  namespace: alert-service
+  labels:
+    app: disk-fill-demo
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: disk-fill-demo
+  template:
+    metadata:
+      labels:
+        app: disk-fill-demo
+    spec:
+      nodeSelector:
+        demo-service: "true"
+      containers:
+        - name: app
+          image: nkey01/disk-fill-demo:1.0.3  # change to your own image/repo
+          ports:
+            - containerPort: 8000
+          # no readiness/liveness probes needed (the pod just has to stay Running)
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: disk-fill-demo
+  namespace: alert-service
+  labels:
+    app: disk-fill-demo
+spec:
+  type: NodePort
+  selector:
+    app: disk-fill-demo
+  ports:
+    - name: http
+      port: 80
+      targetPort: 8000
+      nodePort: 30001
+
diff --git 
a/k3s-monitoring/alertmanager-config.yml b/k3s-monitoring/alertmanager-config.yml
new file mode 100644
index 0000000..adb10b8
--- /dev/null
+++ b/k3s-monitoring/alertmanager-config.yml
@@ -0,0 +1,32 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alertmanager-config
+  namespace: monitoring
+data:
+  alertmanager.yml: |
+    global:
+      resolve_timeout: 5m
+
+    route:
+      receiver: 'default'
+      group_by: ['alertname', 'instance']
+      group_wait: 30s
+      group_interval: 2m
+      repeat_interval: 4h
+
+      routes:
+        - receiver: 'n8n-webhook'
+          matchers:
+            - 'severity="critical"'
+          group_wait: 10s
+          group_interval: 1m
+          repeat_interval: 2h
+
+    receivers:
+      - name: 'default'
+
+      - name: 'n8n-webhook'
+        webhook_configs:
+          # the host part is empty in this repo; point it at your n8n endpoint
+          - url: 'http:///n8n/webhook/alert'
+            send_resolved: true
diff --git a/k3s-monitoring/alertmanager-deploy.yml b/k3s-monitoring/alertmanager-deploy.yml
new file mode 100644
index 0000000..1e2bce4
--- /dev/null
+++ b/k3s-monitoring/alertmanager-deploy.yml
@@ -0,0 +1,50 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alertmanager
+  namespace: monitoring
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alertmanager
+  template:
+    metadata:
+      labels:
+        app: alertmanager
+    spec:
+      nodeName: rpi-worker-monitor
+      containers:
+        - name: alertmanager
+          image: prom/alertmanager:latest
+          args:
+            - "--config.file=/etc/alertmanager/alertmanager.yml"
+            - "--storage.path=/alertmanager"
+            - "--log.level=debug"
+          ports:
+            - containerPort: 9093
+          volumeMounts:
+            - name: alertmanager-config
+              mountPath: /etc/alertmanager
+            - name: alertmanager-data
+              mountPath: /alertmanager
+      volumes:
+        - name: alertmanager-config
+          configMap:
+            name: alertmanager-config
+        - name: alertmanager-data
+          emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alertmanager
+  namespace: monitoring
+spec:
+  selector:
+    app: alertmanager
+  type: NodePort
+  ports:
+    - port: 9093
+      targetPort: 9093
+      nodePort: 30101
diff --git a/k3s-monitoring/grafana-config.yml b/k3s-monitoring/grafana-config.yml
new file mode 100644
index 0000000..85b0b7b
--- /dev/null
+++ b/k3s-monitoring/grafana-config.yml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-datasources
+  namespace: monitoring
+data:
+  datasources.yml: |
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        access: proxy
+        url: http://prometheus.monitoring.svc.cluster.local:9090
+        isDefault: true
+      - name: Loki
+        type: loki
+        access: proxy
+        url: http://loki.monitoring.svc.cluster.local:3100
diff --git a/k3s-monitoring/grafana-deploy.yml b/k3s-monitoring/grafana-deploy.yml
new file mode 100644
index 0000000..6d08bff
--- /dev/null
+++ b/k3s-monitoring/grafana-deploy.yml
@@ -0,0 +1,51 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: grafana
+  template:
+    metadata:
+      labels:
+        app: grafana
+    spec:
+      nodeName: rpi-worker-monitor
+      containers:
+        - name: grafana
+          image: grafana/grafana:latest
+          ports:
+            - containerPort: 3000
+          env:
+            - name: GF_SECURITY_ADMIN_USER
+              value: "admin"
+            - name: GF_SECURITY_ADMIN_PASSWORD
+              value: "admin1234"  # change this later!
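+            # a sketch of the non-demo alternative, instead of the literal above
+            # (assumes a Secret created beforehand, e.g. `kubectl -n monitoring
+            # create secret generic grafana-admin --from-literal=password=...`):
+            #   - name: GF_SECURITY_ADMIN_PASSWORD
+            #     valueFrom:
+            #       secretKeyRef:
+            #         name: grafana-admin
+            #         key: password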
+          volumeMounts:
+            - name: grafana-storage
+              mountPath: /var/lib/grafana
+            - name: grafana-datasources
+              mountPath: /etc/grafana/provisioning/datasources
+      volumes:
+        - name: grafana-storage
+          emptyDir: {}
+        - name: grafana-datasources
+          configMap:
+            name: grafana-datasources
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  selector:
+    app: grafana
+  type: NodePort
+  ports:
+    - port: 3000
+      targetPort: 3000
+      nodePort: 30103
diff --git a/k3s-monitoring/loki-config.yml b/k3s-monitoring/loki-config.yml
new file mode 100644
index 0000000..a9c86f1
--- /dev/null
+++ b/k3s-monitoring/loki-config.yml
@@ -0,0 +1,65 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loki-config
+  namespace: monitoring
+data:
+  loki-config.yml: |
+    auth_enabled: false
+
+    server:
+      http_listen_port: 3100
+      grpc_listen_port: 9096
+
+    common:
+      instance_addr: 127.0.0.1
+      path_prefix: /loki
+      storage:
+        filesystem:
+          chunks_directory: /loki/chunks
+          rules_directory: /loki/rules
+      ring:
+        kvstore:
+          store: inmemory
+
+    ingester:
+      lifecycler:
+        ring:
+          kvstore:
+            store: inmemory
+          replication_factor: 1
+      chunk_idle_period: 5m
+      chunk_block_size: 262144
+      chunk_target_size: 1048576
+      chunk_retain_period: 1m
+      max_transfer_retries: 0
+
+    schema_config:
+      configs:
+        - from: 2020-10-24
+          store: boltdb-shipper
+          object_store: filesystem
+          schema: v11
+          index:
+            prefix: index_
+            period: 24h
+
+    storage_config:
+      boltdb_shipper:
+        active_index_directory: /loki/index
+        shared_store: filesystem
+
+    limits_config:
+      ingestion_rate_mb: 10
+      ingestion_burst_size_mb: 20
+      max_cache_freshness_per_query: 10m
+
+    chunk_store_config:
+      max_look_back_period: 0
+
+    query_range:
+      align_queries_with_step: true
+      max_retries: 5
+
+    frontend:
+      max_outstanding_per_tenant: 1024
diff --git a/k3s-monitoring/loki-deploy.yml b/k3s-monitoring/loki-deploy.yml
new file mode 100644
index 0000000..8530e05
--- /dev/null
+++ b/k3s-monitoring/loki-deploy.yml
@@ -0,0 +1,48 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: loki
+  namespace: monitoring
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: loki
+  template:
+    metadata:
+      labels:
+        app: loki
+    spec:
+      nodeName: rpi-worker-monitor
+      containers:
+        - name: loki
+          image: grafana/loki:2.9.0  # keep in line with the version your docs/promtail use
+          args:
+            - "-config.file=/etc/loki/loki-config.yml"
+          ports:
+            - containerPort: 3100
+          volumeMounts:
+            - name: loki-config
+              mountPath: /etc/loki
+            - name: loki-data
+              mountPath: /loki
+      volumes:
+        - name: loki-config
+          configMap:
+            name: loki-config
+        - name: loki-data
+          emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: loki
+  namespace: monitoring
+spec:
+  selector:
+    app: loki
+  type: NodePort
+  ports:
+    - port: 3100
+      targetPort: 3100
+      nodePort: 30102  # ★ the NodePort promtail pushes to
diff --git a/k3s-monitoring/prometheus-config.yml b/k3s-monitoring/prometheus-config.yml
new file mode 100644
index 0000000..847812d
--- /dev/null
+++ b/k3s-monitoring/prometheus-config.yml
@@ -0,0 +1,162 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:
+  prometheus.yml: |
+    global:
+      scrape_interval: 30s
+      evaluation_interval: 30s
+
+    scrape_configs:
+      - job_name: 'prometheus'
+        static_configs:
+          - targets: ['prometheus:9090']
+
+      - job_name: 'node-exporter'
+        static_configs:
+          - targets:
+              - '100.104.27.47:9100'
+              - '100.77.135.86:9100'
+              - '100.90.177.14:9100'
+
+      - job_name: 'kube-state-metrics'
+        scrape_interval: 30s
+        static_configs:
+          - targets:
+              - 
'kube-state-metrics.kube-system.svc.cluster.local:8080'
+            labels:
+              service: 'kube-state-metrics'
+
+    alerting:
+      alertmanagers:
+        - static_configs:
+            - targets: ['alertmanager:9093']
+
+    rule_files:
+      - "/etc/prometheus/alert.rules.yml"
+  alert.rules.yml: |
+    groups:
+      # 1) node health and resource-usage alerts
+      - name: node-health-rules
+        rules:
+          # ---- node failure / down detection ----
+          - alert: NodeDown                 # rule name
+            expr: up{job="node-exporter"} == 0
+            # up drops to 0 when node-exporter stops responding
+            for: 1m                         # alert after 1 minute of continuous downtime
+            labels:
+              severity: critical
+            annotations:
+              summary: "Node {{ $labels.instance }} is down"
+              description: "No response from node-exporter on {{ $labels.instance }} for more than 1 minute."
+
+          # ---- excessive CPU usage ----
+          - alert: HighCpuLoad
+            expr: avg by (instance) (rate(node_cpu_seconds_total{mode!="idle",job="node-exporter"}[5m])) > 0.7
+            # average non-idle CPU share over the last 5 minutes exceeds 70%
+            for: 1m
+            labels:
+              severity: critical
+            annotations:
+              summary: "High CPU usage on {{ $labels.instance }}"
+              description: "Average CPU usage over 5 minutes is above 70% on {{ $labels.instance }}."
+
+          # ---- excessive memory usage ----
+          - alert: HighMemoryUsage
+            expr: 1 - (node_memory_MemAvailable_bytes{job="node-exporter"}
+                  / node_memory_MemTotal_bytes{job="node-exporter"}) > 0.8
+            # usage = 1 - (available / total) > 0.8 → more than 80% in use
+            for: 1m
+            labels:
+              severity: critical
+            annotations:
+              summary: "High memory usage on {{ $labels.instance }}"
+              description: "Memory usage is above 80% for more than 1 minute on {{ $labels.instance }}."
+
+          # ---- disk almost full ----
+          - alert: DiskAlmostFull
+            expr: 1 - (
+                  node_filesystem_avail_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
+                  / node_filesystem_size_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
+                  ) > 0.4
+            # excludes ephemeral/in-memory filesystems such as overlay and tmpfs
+            # fires when usage exceeds 40% (raise to 0.85 for the conventional 85%)
+            for: 1m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Disk almost full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
+              description: "Filesystem {{ $labels.mountpoint }} is above 40% usage for more than 1 minute on {{ $labels.instance }}."
+
+          # ---- system load (overload relative to core count) ----
+          - alert: HighSystemLoad
+            expr: (
+                  avg by (instance) (node_load5{job="node-exporter"})
+                  /
+                  on (instance) count by (instance) (node_cpu_seconds_total{job="node-exporter",mode="idle"})
+                  ) > 2
+            # 5-minute load average / number of CPU cores > 2 → heavy load for the core count
+            for: 1m
+            labels:
+              severity: critical
+            annotations:
+              summary: "High load average on {{ $labels.instance }}"
+              description: "5-minute load average per CPU core is above 2 for more than 1 minute on {{ $labels.instance }}."
+
+      # 2) Kubernetes workload alerts (based on kube-state-metrics)
+      - name: kubernetes-workload-rules
+        rules:
+          # ---- K8s node is NotReady ----
+          - alert: KubeNodeNotReady
+            expr: kube_node_status_condition{condition="Ready",status="true",job="kube-state-metrics"} == 0
+            # if the Ready=true series is 0, the node is effectively NotReady
+            for: 1m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Kubernetes node {{ $labels.node }} is NotReady"
+              description: "Kubernetes node {{ $labels.node }} has been in NotReady state for more than 1 minute."
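+
+          # tip: the critical-alert routing can be tested end to end by POSTing a
+          # fake alert to Alertmanager's v2 API (NodePort 30101 from
+          # alertmanager-deploy.yml; <node-ip> is any reachable node address):
+          #   curl -XPOST http://<node-ip>:30101/api/v2/alerts \
+          #     -H 'Content-Type: application/json' \
+          #     -d '[{"labels":{"alertname":"TestAlert","severity":"critical"}}]'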
+
+          # ---- pod is crash looping (repeated restarts) ----
+          - alert: PodCrashLooping
+            expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) > 3
+            # the same container restarted more than 3 times within 5 minutes
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
+              description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} restarted more than 3 times in the last 5 minutes."
+
+          # ---- pod is Running but not Ready ----
+          - alert: PodNotReady
+            expr: (
+                  kube_pod_status_ready{condition="true",job="kube-state-metrics"} == 0
+                  )
+                  and on (namespace, pod)
+                  (
+                  kube_pod_status_phase{phase="Running",job="kube-state-metrics"} == 1
+                  )
+            # phase is Running but Ready = false
+            for: 1m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not Ready"
+              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is Running but not Ready for more than 1 minute."
+
+          # ---- Deployment: desired vs. actually available replicas mismatch ----
+          - alert: DeploymentReplicasMismatch
+            expr: kube_deployment_status_replicas_available{job="kube-state-metrics"}
+                  < kube_deployment_spec_replicas{job="kube-state-metrics"}
+            # available replicas < desired replicas → failing to keep the expected count
+            for: 10m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has unavailable replicas"
+              description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has fewer available replicas than desired for more than 10 minutes."
+
+
diff --git a/k3s-monitoring/prometheus-deploy.yml b/k3s-monitoring/prometheus-deploy.yml
new file mode 100644
index 0000000..ae12a35
--- /dev/null
+++ b/k3s-monitoring/prometheus-deploy.yml
@@ -0,0 +1,52 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus
+  namespace: monitoring
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      # keep this if Prometheus should stay pinned to the single monitoring worker node
+      nodeName: rpi-worker-monitor
+      containers:
+        - name: prometheus
+          image: prom/prometheus:latest
+          args:
+            - "--config.file=/etc/prometheus/prometheus.yml"
+            - "--storage.tsdb.path=/prometheus"
+            - "--web.enable-lifecycle"
+          ports:
+            - containerPort: 9090
+          volumeMounts:
+            - name: prometheus-config
+              mountPath: /etc/prometheus
+            - name: prometheus-data
+              mountPath: /prometheus
+      volumes:
+        - name: prometheus-config
+          configMap:
+            name: prometheus-config
+        - name: prometheus-data
+          emptyDir: {}  # for testing; swap in a PVC later if needed
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  namespace: monitoring
+spec:
+  selector:
+    app: prometheus
+  type: NodePort
+  ports:
+    - port: 9090        # in-cluster (ClusterIP) port
+      targetPort: 9090
+      nodePort: 30100   # NodePort for external access (30000+ range)
diff --git a/kube-state-metrics/kube-state-metrics.yaml b/kube-state-metrics/kube-state-metrics.yaml
new file mode 100644
index 0000000..bad9b8e
--- /dev/null
+++ b/kube-state-metrics/kube-state-metrics.yaml
@@ -0,0 +1,99 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kube-state-metrics
+rules:
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - pods
+      - services
+      - namespaces
+      - persistentvolumeclaims
+      - persistentvolumes
+      - resourcequotas
+      - replicationcontrollers
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["apps"]
+    resources:
+      - deployments
+      - 
daemonsets + - statefulsets + - replicasets + verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: + - jobs + - cronjobs + verbs: ["get", "list", "watch"] + - apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.10.0 + imagePullPolicy: IfNotPresent + ports: + - name: http-metrics + containerPort: 8080 + resources: + requests: + cpu: "50m" + memory: "80Mi" + limits: + cpu: "200m" + memory: "200Mi" +--- +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: kube-system + labels: + app: kube-state-metrics +spec: + type: ClusterIP + selector: + app: kube-state-metrics + ports: + - name: http-metrics + port: 8080 + targetPort: http-metrics