monitoring: алерты через Alertmanager → Telegram (node down/cpu/mem/disk)

This commit is contained in:
Ruslan Gilfanov
2026-06-21 19:34:39 +03:00
parent 75d00cc400
commit b2d04e0573
5 changed files with 83 additions and 2 deletions

2
.gitignore vendored
View File

@@ -2,3 +2,5 @@
# Private keys and certificates must never be committed.
*.key
*.pem
secrets/
# Telegram bot token file mounted into the alertmanager container.
# The leading "**/" matches the file in every directory, so no
# per-directory pattern (e.g. alertmanager/telegram_token) is needed.
**/telegram_token

View File

@@ -0,0 +1,21 @@
# Alertmanager configuration: every alert is routed to a single Telegram chat.
global:
  # Time after which an alert that carries no end time is declared resolved
  # if it has not been updated.
  resolve_timeout: 5m

route:
  # Single default receiver; there are no child routes.
  receiver: telegram
  # One notification group per (alertname, instance) pair.
  group_by: ['alertname', 'instance']
  # Wait before the first notification of a new group.
  group_wait: 30s
  # Wait before notifying about changes in an already-notified group.
  group_interval: 5m
  # Re-send interval while a group keeps firing.
  repeat_interval: 3h

# HostDiskLow (< 15 % free) and HostDiskCritical (< 5 % free) overlap once a
# filesystem drops below 5 %. Mute the warning while the critical alert for
# the same filesystem is firing, so only one notification is sent.
inhibit_rules:
  - source_matchers:
      - 'alertname="HostDiskCritical"'
    target_matchers:
      - 'alertname="HostDiskLow"'
    equal: ['instance', 'device', 'mountpoint']

receivers:
  - name: telegram
    telegram_configs:
      # The bot token is read from a file so it never appears in this config.
      - bot_token_file: /etc/alertmanager/telegram_token
        # Must stay an integer: Alertmanager expects a numeric chat id.
        chat_id: 607015
        parse_mode: HTML
        # Also notify when the alert group resolves.
        send_resolved: true
        # Header line (status icon, alert name, status) followed by one
        # summary annotation per alert in the group.
        message: |
          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} <b>{{ .CommonLabels.alertname }}</b> [{{ .Status | toUpper }}]
          {{ range .Alerts }}{{ .Annotations.summary }}
          {{ end }}

View File

@@ -12,11 +12,25 @@ services:
- '--web.enable-lifecycle'
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./prometheus/targets:/etc/prometheus/targets:ro
- ./prometheus/rules:/etc/prometheus/rules:ro
- prometheus_data:/prometheus
ports:
- '127.0.0.1:9090:9090'
networks: [monitoring]
# Alertmanager service: reachable by other containers on the monitoring
# network and from the host on the loopback interface only.
alertmanager:
  image: prom/alertmanager:latest
  container_name: alertmanager
  restart: unless-stopped
  volumes:
    # Main configuration file, mounted read-only.
    - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    # Telegram bot token file (git-ignored), mounted read-only.
    - ./alertmanager/telegram_token:/etc/alertmanager/telegram_token:ro
    # Named volume mounted at /alertmanager.
    - alertmanager_data:/alertmanager
  ports:
    # Bound to 127.0.0.1 so the UI/API is not published on external interfaces.
    - "127.0.0.1:9093:9093"
  networks:
    - monitoring
grafana:
image: grafana/grafana:latest
container_name: grafana
@@ -69,6 +83,7 @@ services:
# Named volumes with default settings (explicit empty mappings).
volumes:
  prometheus_data: {}
  grafana_data: {}
  alertmanager_data: {}
networks:
monitoring:

View File

@@ -4,17 +4,23 @@ global:
external_labels:
monitor: ruzzy-infra
# Alerts produced by the rules below are pushed to this Alertmanager address.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# Every *.yml file in the rules directory is loaded as a rule file.
rule_files:
  - /etc/prometheus/rules/*.yml
scrape_configs:
  # Prometheus scraping its own endpoint.
  - job_name: prometheus
    static_configs:
      - targets:
          - "localhost:9090"

  # Hosts: for a new server, add a file at targets/node/<name>.yml + reload.
  - job_name: node
    file_sd_configs:
      - files:
          - "/etc/prometheus/targets/node/*.yml"

  # Containers
  - job_name: cadvisor
    file_sd_configs:
      - files:
          - "/etc/prometheus/targets/cadvisor/*.yml"

View File

@@ -0,0 +1,37 @@
# Host-level alerting rules: target availability, CPU, memory and disk space.
# The summary annotations are user-facing notification text and are kept
# verbatim.
groups:
  - name: host
    rules:
      # Any scrape target whose `up` series has been 0 for 2 minutes.
      - alert: TargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Таргет {{ $labels.job }} на {{ $labels.instance }} недоступен'

      # CPU usage (100 minus the average idle percentage per instance over
      # 5m) above 90 % for 10 minutes.
      - alert: HostHighCPU
        expr: >-
          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Высокий CPU на {{ $labels.instance }}: {{ printf "%.0f" $value }}%'

      # Available memory below 10 % of total for 5 minutes.
      - alert: HostLowMemory
        expr: >-
          node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100
          < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Мало памяти на {{ $labels.instance }}: доступно {{ printf "%.0f" $value }}%'

      # Free space below 15 % on a filesystem (tmpfs/overlay/squashfs
      # excluded) for 5 minutes.
      - alert: HostDiskLow
        expr: >-
          node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}
          * 100 < 15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Кончается диск на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf "%.0f" $value }}%'

      # Same expression with a 5 % threshold and critical severity.
      - alert: HostDiskCritical
        expr: >-
          node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}
          * 100 < 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'КРИТИЧНО: диск почти полон на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf "%.0f" $value }}%'