Master-Node Stack & README
k3s-monitoring/alertmanager-config.yml (new file, 32 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m

    route:
      receiver: 'default'
      group_by: ['alertname', 'instance']
      group_wait: 30s
      group_interval: 2m
      repeat_interval: 4h

      routes:
        - receiver: 'n8n-webhook'
          matchers:
            - 'severity="critical"'
          group_wait: 10s
          group_interval: 1m
          repeat_interval: 2h

    receivers:
      - name: 'default'

      - name: 'n8n-webhook'
        webhook_configs:
          - url: 'http://<IP>/n8n/webhook/alert'
            send_resolved: true
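Every manifest in this commit targets the monitoring namespace, but nothing in the commit creates it. A minimal manifest in case it does not exist yet (apply before the files above and below); note also that the <IP> placeholder in the webhook URL must be filled in with the n8n host before applying:

apiVersion: v1
kind: Namespace
metadata:
  name: monitoring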
k3s-monitoring/alertmanager-deploy.yml (new file, 50 lines)

apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      nodeName: rpi-worker-monitor
      containers:
        - name: alertmanager
          image: prom/alertmanager:latest
          args:
            - "--config.file=/etc/alertmanager/alertmanager.yml"
            - "--storage.path=/alertmanager"
            - "--log.level=debug"
          ports:
            - containerPort: 9093
          volumeMounts:
            - name: alertmanager-config
              mountPath: /etc/alertmanager
            - name: alertmanager-data
              mountPath: /alertmanager
      volumes:
        - name: alertmanager-config
          configMap:
            name: alertmanager-config
        - name: alertmanager-data
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  selector:
    app: alertmanager
  type: NodePort
  ports:
    - port: 9093
      targetPort: 9093
      nodePort: 30101
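For reference when building the n8n flow: Alertmanager's webhook receiver POSTs JSON of roughly this shape (abridged version 4 payload; the field values here are illustrative, not captured from this cluster):

{
  "version": "4",
  "status": "firing",
  "receiver": "n8n-webhook",
  "groupLabels": { "alertname": "NodeDown", "instance": "100.104.27.47:9100" },
  "commonLabels": { "severity": "critical" },
  "commonAnnotations": { "summary": "Node 100.104.27.47:9100 is down" },
  "externalURL": "http://alertmanager:9093",
  "alerts": [
    {
      "status": "firing",
      "labels": { "alertname": "NodeDown", "severity": "critical" },
      "annotations": { "summary": "Node 100.104.27.47:9100 is down" },
      "startsAt": "2024-01-01T00:00:00Z",
      "endsAt": "0001-01-01T00:00:00Z"
    }
  ]
}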
k3s-monitoring/grafana-config.yml (new file, 18 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: monitoring
data:
  datasources.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus.monitoring.svc.cluster.local:9090
        isDefault: true
      - name: Loki
        type: loki
        access: proxy
        url: http://loki.monitoring.svc.cluster.local:3100
k3s-monitoring/grafana-deploy.yml (new file, 51 lines)

apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      nodeName: rpi-worker-monitor
      containers:
        - name: grafana
          image: grafana/grafana:latest
          ports:
            - containerPort: 3000
          env:
            - name: GF_SECURITY_ADMIN_USER
              value: "admin"
            - name: GF_SECURITY_ADMIN_PASSWORD
              value: "admin1234" # change this later! (a Secret-based sketch follows below)
          volumeMounts:
            - name: grafana-storage
              mountPath: /var/lib/grafana
            - name: grafana-datasources
              mountPath: /etc/grafana/provisioning/datasources
      volumes:
        - name: grafana-storage
          emptyDir: {}
        - name: grafana-datasources
          configMap:
            name: grafana-datasources
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
spec:
  selector:
    app: grafana
  type: NodePort
  ports:
    - port: 3000
      targetPort: 3000
      nodePort: 30103
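A hedged sketch for retiring the hard-coded password above: store it in a Secret and reference it from the Deployment's env. The Secret name grafana-admin is an assumption, not part of this commit:

apiVersion: v1
kind: Secret
metadata:
  name: grafana-admin        # hypothetical name, not created by this commit
  namespace: monitoring
type: Opaque
stringData:
  admin-password: admin1234  # replace with a real password

# then, in grafana-deploy.yml, swap the literal value for:
#   - name: GF_SECURITY_ADMIN_PASSWORD
#     valueFrom:
#       secretKeyRef:
#         name: grafana-admin
#         key: admin-password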
k3s-monitoring/loki-config.yml (new file, 65 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-config
  namespace: monitoring
data:
  loki-config.yml: |
    auth_enabled: false

    server:
      http_listen_port: 3100
      grpc_listen_port: 9096

    common:
      instance_addr: 127.0.0.1
      path_prefix: /loki
      storage:
        filesystem:
          chunks_directory: /loki/chunks
          rules_directory: /loki/rules
      ring:
        kvstore:
          store: inmemory

    ingester:
      lifecycler:
        ring:
          kvstore:
            store: inmemory
          replication_factor: 1
      chunk_idle_period: 5m
      chunk_block_size: 262144
      chunk_target_size: 1048576
      chunk_retain_period: 1m
      max_transfer_retries: 0

    schema_config:
      configs:
        - from: 2020-10-24
          store: boltdb-shipper
          object_store: filesystem
          schema: v11
          index:
            prefix: index_
            period: 24h

    storage_config:
      boltdb_shipper:
        active_index_directory: /loki/index
        shared_store: filesystem

    limits_config:
      ingestion_rate_mb: 10
      ingestion_burst_size_mb: 20
      max_cache_freshness_per_query: 10m

    chunk_store_config:
      max_look_back_period: 0

    query_range:
      align_queries_with_step: true
      max_retries: 5

    frontend:
      max_outstanding_per_tenant: 1024
k3s-monitoring/loki-deploy.yml (new file, 48 lines)

apiVersion: apps/v1
kind: Deployment
metadata:
  name: loki
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: loki
  template:
    metadata:
      labels:
        app: loki
    spec:
      nodeName: rpi-worker-monitor
      containers:
        - name: loki
          image: grafana/loki:2.9.0 # match this to the version used in the docs
          args:
            - "-config.file=/etc/loki/loki-config.yml"
          ports:
            - containerPort: 3100
          volumeMounts:
            - name: loki-config
              mountPath: /etc/loki
            - name: loki-data
              mountPath: /loki
      volumes:
        - name: loki-config
          configMap:
            name: loki-config
        - name: loki-data
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: loki
  namespace: monitoring
spec:
  selector:
    app: loki
  type: NodePort
  ports:
    - port: 3100
      targetPort: 3100
      nodePort: 30102 # ★ the NodePort promtail pushes to (see the sketch below)
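The NodePort comment above assumes promtail runs outside the cluster and pushes logs over a node IP. A minimal promtail client config, as a sketch (the <node-ip> placeholder, listen port, and log paths are assumptions):

server:
  http_listen_port: 9080
positions:
  filename: /tmp/positions.yaml
clients:
  - url: http://<node-ip>:30102/loki/api/v1/push  # any cluster node's IP reaches a NodePort
scrape_configs:
  - job_name: system
    static_configs:
      - targets: [localhost]
        labels:
          job: varlogs
          __path__: /var/log/*.log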
k3s-monitoring/prometheus-config.yml (new file, 162 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s

    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
          - targets: ['prometheus:9090']

      - job_name: 'node-exporter'
        static_configs:
          - targets:
              - '100.104.27.47:9100'
              - '100.77.135.86:9100'
              - '100.90.177.14:9100'

      - job_name: 'kube-state-metrics'
        scrape_interval: 30s
        static_configs:
          - targets:
              - 'kube-state-metrics.kube-system.svc.cluster.local:8080'
            labels:
              service: 'kube-state-metrics'

    alerting:
      alertmanagers:
        - static_configs:
            - targets: ['alertmanager:9093']

    rule_files:
      - "/etc/prometheus/alert.rules.yml"
  alert.rules.yml: |
    groups:
      # 1) Node health and resource usage alerts
      - name: node-health-rules
        rules:
          # ---- Node failure / down detection ----
          - alert: NodeDown  # rule name
            expr: up{job="node-exporter"} == 0
            # up drops to 0 when node-exporter stops responding
            for: 1m  # fire after 1 minute of continuous downtime
            labels:
              severity: critical
            annotations:
              summary: "Node {{ $labels.instance }} is down"
              description: "No response from node-exporter on {{ $labels.instance }} for more than 1 minute."

          # ---- Excessive CPU usage ----
          - alert: HighCpuLoad
            expr: avg by (instance) (rate(node_cpu_seconds_total{mode!="idle",job="node-exporter"}[5m])) > 0.7
            # average non-idle CPU share over the last 5 minutes exceeds 70%
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "High CPU usage on {{ $labels.instance }}"
              description: "Average CPU usage over 5 minutes is above 70% on {{ $labels.instance }}."

          # ---- Excessive memory usage ----
          - alert: HighMemoryUsage
            expr: 1 - (node_memory_MemAvailable_bytes{job="node-exporter"}
                  / node_memory_MemTotal_bytes{job="node-exporter"}) > 0.8
            # usage = 1 - (available / total) > 0.8, i.e. more than 80% in use
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "High memory usage on {{ $labels.instance }}"
              description: "Memory usage is above 80% for more than 1 minute on {{ $labels.instance }}."

          # ---- Disk almost full ----
          - alert: DiskAlmostFull
            expr: 1 - (
                    node_filesystem_avail_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
                    / node_filesystem_size_bytes{job="node-exporter",fstype!~"tmpfs|overlay"}
                  ) > 0.4
            # excludes ephemeral/in-memory filesystems such as overlay and tmpfs
            # fires when usage exceeds 40%
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Disk almost full on {{ $labels.instance }} ({{ $labels.mountpoint }})"
              description: "Filesystem {{ $labels.mountpoint }} is above 40% usage for more than 1 minute on {{ $labels.instance }}."

          # ---- System load (overload relative to core count) ----
          - alert: HighSystemLoad
            expr: (
                    avg by (instance) (node_load5{job="node-exporter"})
                    /
                    on (instance) count by (instance) (node_cpu_seconds_total{job="node-exporter",mode="idle"})
                  ) > 2
            # 5-minute load average / CPU core count > 2, i.e. heavy load relative to core count
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "High load average on {{ $labels.instance }}"
              description: "5-minute load average per CPU core is above 2 for more than 1 minute on {{ $labels.instance }}."

      # 2) Kubernetes workload alerts (via kube-state-metrics)
      - name: kubernetes-workload-rules
        rules:
          # ---- K8s node in NotReady state ----
          - alert: KubeNodeNotReady
            expr: kube_node_status_condition{condition="Ready",status="true",job="kube-state-metrics"} == 0
            # if the Ready=true series is 0, the node is effectively NotReady
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Kubernetes node {{ $labels.node }} is NotReady"
              description: "Kubernetes node {{ $labels.node }} has been in NotReady state for more than 1 minute."

          # ---- Pod crash looping (repeated restarts) ----
          - alert: PodCrashLooping
            expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) > 3
            # the same container restarted more than 3 times within 5 minutes
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
              description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} restarted more than 3 times in the last 5 minutes."

          # ---- Pod Running but not Ready ----
          - alert: PodNotReady
            expr: (
                    kube_pod_status_ready{condition="true",job="kube-state-metrics"} == 0
                  )
                  and on (namespace, pod)
                  (
                    kube_pod_status_phase{phase="Running",job="kube-state-metrics"} == 1
                  )
            # pod is in the Running phase but Ready = false
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not Ready"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is Running but not Ready for more than 1 minute."

          # ---- Deployment: desired vs. available replicas mismatch ----
          - alert: DeploymentReplicasMismatch
            expr: kube_deployment_status_replicas_available{job="kube-state-metrics"}
                  < kube_deployment_spec_replicas{job="kube-state-metrics"}
            # available replicas < desired replicas: failing to keep the desired count
            for: 10m
            labels:
              severity: critical
            annotations:
              summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has unavailable replicas"
              description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has fewer available replicas than desired for more than 10 minutes."
k3s-monitoring/prometheus-deploy.yml (new file, 52 lines)

apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      # keep this if you want to pin Prometheus to the monitoring worker node
      nodeName: rpi-worker-monitor
      containers:
        - name: prometheus
          image: prom/prometheus:latest
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: prometheus-config
              mountPath: /etc/prometheus
            - name: prometheus-data
              mountPath: /prometheus
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
        - name: prometheus-data
          emptyDir: {}  # for testing; can be swapped for a PVC later (sketch below)

---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
spec:
  selector:
    app: prometheus
  type: NodePort
  ports:
    - port: 9090        # ClusterIP port
      targetPort: 9090
      nodePort: 30100   # NodePort for external access (30000+ range)
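A hedged sketch of the PVC swap mentioned in the emptyDir comment above. The claim name and size are assumptions, and the storage class is left to the cluster default:

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data      # hypothetical; not part of this commit
  namespace: monitoring
spec:
  accessModes: ["ReadWriteOnce"]
  resources:
    requests:
      storage: 10Gi

# then replace the emptyDir volume in prometheus-deploy.yml with:
#   - name: prometheus-data
#     persistentVolumeClaim:
#       claimName: prometheus-data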