monitoring: алерты через Alertmanager → Telegram (node down/cpu/mem/disk)
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,3 +2,5 @@
|
||||
*.key
*.pem
secrets/
# Telegram bot token used by Alertmanager: ignore it wherever it lives.
# A pattern with a slash in the middle is anchored to this file's directory,
# so only the `**/` form matches the token under stacks/monitoring/alertmanager/;
# it also covers a top-level alertmanager/telegram_token.
**/telegram_token
|
||||
|
||||
21
stacks/monitoring/alertmanager/alertmanager.yml
Normal file
21
stacks/monitoring/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,21 @@
|
||||
# Alertmanager configuration: every alert is routed to one Telegram chat.
global:
  resolve_timeout: 5m

route:
  receiver: telegram
  # One notification per alert name and host; alerts that differ only in
  # other labels (e.g. mountpoint) are listed inside the same message.
  group_by: ['alertname', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h

inhibit_rules:
  # HostDiskCritical (< 5 % free) implies HostDiskLow (< 15 % free) for the
  # same filesystem, so mute the warning while the critical alert is firing.
  - source_matchers: ['alertname="HostDiskCritical"']
    target_matchers: ['alertname="HostDiskLow"']
    equal: ['instance', 'mountpoint']

receivers:
  - name: telegram
    telegram_configs:
      # The bot token is read from a file so it never appears in this config.
      - bot_token_file: /etc/alertmanager/telegram_token
        chat_id: 607015
        parse_mode: HTML
        send_resolved: true
        # First line: status icon, bold alert name, upper-cased status.
        # Then one line per alert in the group with its `summary` annotation.
        message: |
          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} <b>{{ .CommonLabels.alertname }}</b> [{{ .Status | toUpper }}]
          {{ range .Alerts }}{{ .Annotations.summary }}
          {{ end }}
|
||||
@@ -12,11 +12,25 @@ services:
|
||||
- '--web.enable-lifecycle'
|
||||
volumes:
|
||||
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./prometheus/targets:/etc/prometheus/targets:ro
|
||||
- ./prometheus/rules:/etc/prometheus/rules:ro
|
||||
- prometheus_data:/prometheus
|
||||
ports:
|
||||
- '127.0.0.1:9090:9090'
|
||||
networks: [monitoring]
|
||||
|
||||
alertmanager:
  # NOTE(review): `latest` is a floating tag; consider pinning a release for
  # reproducible deploys.
  image: prom/alertmanager:latest
  container_name: alertmanager
  restart: unless-stopped
  volumes:
    # Routing and receiver configuration, mounted read-only.
    - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    # Telegram bot token, mounted read-only.
    # NOTE(review): the host file should exist before the stack is started;
    # confirm how Docker treats a missing bind-mount source.
    - ./alertmanager/telegram_token:/etc/alertmanager/telegram_token:ro
    # Named volume `alertmanager_data`, mounted at /alertmanager.
    - alertmanager_data:/alertmanager
  ports:
    # Published on the host loopback interface only.
    - "127.0.0.1:9093:9093"
  networks:
    - monitoring
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
container_name: grafana
|
||||
@@ -69,6 +83,7 @@ services:
|
||||
# Named volumes used by the services above; none overrides any setting.
volumes:
  prometheus_data: {}
  grafana_data: {}
  alertmanager_data: {}
|
||||
|
||||
networks:
|
||||
monitoring:
|
||||
|
||||
@@ -4,17 +4,23 @@ global:
|
||||
external_labels:
|
||||
monitor: ruzzy-infra
|
||||
|
||||
# Deliver firing alerts to the Alertmanager reachable at alertmanager:9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"
|
||||
|
||||
# Load every *.yml rule file from the rules directory.
rule_files:
  - "/etc/prometheus/rules/*.yml"
|
||||
|
||||
scrape_configs:
|
||||
# Prometheus scrapes its own endpoint on localhost:9090.
- job_name: prometheus
  static_configs:
    - targets:
        - "localhost:9090"
|
||||
|
||||
# Hosts: to add a server, create targets/node/<name>.yml and reload.
- job_name: node
  file_sd_configs:
    - files:
        - "/etc/prometheus/targets/node/*.yml"
|
||||
|
||||
# Containers: targets are discovered from files in targets/cadvisor/.
- job_name: cadvisor
  file_sd_configs:
    - files:
        - "/etc/prometheus/targets/cadvisor/*.yml"
|
||||
|
||||
37
stacks/monitoring/prometheus/rules/alerts.yml
Normal file
37
stacks/monitoring/prometheus/rules/alerts.yml
Normal file
@@ -0,0 +1,37 @@
|
||||
groups:
|
||||
- name: host
|
||||
rules:
|
||||
# Fires for any scrape target whose `up` series has been 0 for two minutes.
- alert: TargetDown
  expr: >-
    up == 0
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: 'Таргет {{ $labels.job }} на {{ $labels.instance }} недоступен'
|
||||
|
||||
# Non-idle CPU percentage per instance: 100 minus the average 5-minute idle
# rate over that instance's idle-mode series. Fires above 90 for 10 minutes.
- alert: HostHighCPU
  expr: >-
    100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'Высокий CPU на {{ $labels.instance }}: {{ printf "%.0f" $value }}%'
|
||||
|
||||
# Available memory as a percentage of total; fires below 10 for 5 minutes.
- alert: HostLowMemory
  expr: >-
    node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Мало памяти на {{ $labels.instance }}: доступно {{ printf "%.0f" $value }}%'
|
||||
|
||||
# Free space as a percentage of filesystem size, skipping tmpfs, overlay and
# squashfs; fires below 15 for 5 minutes. This condition also holds whenever
# HostDiskCritical (< 5) does.
- alert: HostDiskLow
  expr: >-
    node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}
    * 100 < 15
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Кончается диск на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf "%.0f" $value }}%'
|
||||
|
||||
# Same ratio as HostDiskLow with a tighter threshold: fires when free space
# stays below 5 percent of filesystem size for 5 minutes.
- alert: HostDiskCritical
  expr: >-
    node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}
    * 100 < 5
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'КРИТИЧНО: диск почти полон на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf "%.0f" $value }}%'
|
||||
Reference in New Issue
Block a user