monitoring: алерты через Alertmanager → Telegram (node down/cpu/mem/disk)
This commit is contained in:
@@ -4,17 +4,23 @@ global:
|
||||
external_labels:
|
||||
monitor: ruzzy-infra
|
||||
|
||||
# Alert delivery: firing alerts are sent to the single statically
# configured Alertmanager endpoint listed below.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
|
||||
|
||||
# Rule definitions are loaded from every *.yml file matching this glob.
rule_files:
  - '/etc/prometheus/rules/*.yml'
|
||||
|
||||
scrape_configs:
  # Self-scrape of the Prometheus server at localhost:9090.
  - job_name: prometheus
    static_configs:
      - targets:
          - 'localhost:9090'

  # Hosts: to add a new server, create targets/node/<name>.yml and reload.
  - job_name: node
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets/node/*.yml'

  # Containers.
  - job_name: cadvisor
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets/cadvisor/*.yml'
|
||||
|
||||
37
stacks/monitoring/prometheus/rules/alerts.yml
Normal file
37
stacks/monitoring/prometheus/rules/alerts.yml
Normal file
@@ -0,0 +1,37 @@
|
||||
groups:
  # Host- and target-level alerting rules.
  - name: host
    rules:
      # Any scrape target that has reported up == 0 for 2 minutes.
      - alert: TargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Таргет {{ $labels.job }} на {{ $labels.instance }} недоступен"

      # Non-idle CPU share (100 minus the per-instance average idle rate
      # over 5m, in percent) above 90 for 10 minutes.
      - alert: HostHighCPU
        expr: >-
          100 - (avg by(instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Высокий CPU на {{ $labels.instance }}: {{ printf \"%.0f\" $value }}%"

      # MemAvailable as a percentage of MemTotal below 10 for 5 minutes.
      - alert: HostLowMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Мало памяти на {{ $labels.instance }}: доступно {{ printf \"%.0f\" $value }}%"

      # Available space below 15% of filesystem size for 5 minutes;
      # tmpfs, overlay and squashfs filesystems are excluded.
      # NOTE(review): this expression is also true whenever HostDiskCritical
      # fires (same ratio, lower threshold) - confirm the duplicate warning
      # is suppressed on the Alertmanager side if that is not desired.
      - alert: HostDiskLow
        expr: >-
          node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}
          * 100 < 15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Кончается диск на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf \"%.0f\" $value }}%"

      # Same ratio as HostDiskLow with a 5% threshold and critical severity.
      - alert: HostDiskCritical
        expr: >-
          node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}
          * 100 < 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "КРИТИЧНО: диск почти полон на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf \"%.0f\" $value }}%"
|
||||
Reference in New Issue
Block a user