From b2d04e057328df53e3d33acd518ae07c4071e1d2 Mon Sep 17 00:00:00 2001
From: Ruslan Gilfanov
Date: Sun, 21 Jun 2026 19:34:39 +0300
Subject: [PATCH] =?UTF-8?q?monitoring:=20=D0=B0=D0=BB=D0=B5=D1=80=D1=82?=
 =?UTF-8?q?=D1=8B=20=D1=87=D0=B5=D1=80=D0=B5=D0=B7=20Alertmanager=20?=
 =?UTF-8?q?=E2=86=92=20Telegram=20(node=20down/cpu/mem/disk)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an Alertmanager service to the monitoring stack and route every
alert to a single Telegram receiver.

* alertmanager.yml: "telegram" receiver; the bot token is read from
  /etc/alertmanager/telegram_token, and .gitignore excludes that file.
* docker-compose.yml: new alertmanager service published on
  127.0.0.1:9093; Prometheus additionally mounts ./prometheus/targets
  and ./prometheus/rules read-only.
* prometheus.yml: send alerts to alertmanager:9093 and load
  /etc/prometheus/rules/*.yml.
* rules/alerts.yml: TargetDown, HostHighCPU, HostLowMemory, HostDiskLow
  and HostDiskCritical.

HostDiskLow (< 15 % free) is still true once HostDiskCritical (< 5 %
free) fires, so an inhibit rule mutes the warning for the same instance
and mountpoint while the critical alert is active.
---
Review note: the inhibit rule was added to alertmanager.yml by hand, so
the post-image blob id on its "index" line is stale; regenerate the
patch with git format-patch after applying if exact ids matter.

 .gitignore                                    |  2 +
 .../monitoring/alertmanager/alertmanager.yml  | 28 ++++++++++++++
 stacks/monitoring/docker-compose.yml          | 15 ++++++++
 stacks/monitoring/prometheus/prometheus.yml   | 10 ++++-
 stacks/monitoring/prometheus/rules/alerts.yml | 37 +++++++++++++++++++
 5 files changed, 90 insertions(+), 2 deletions(-)
 create mode 100644 stacks/monitoring/alertmanager/alertmanager.yml
 create mode 100644 stacks/monitoring/prometheus/rules/alerts.yml

diff --git a/.gitignore b/.gitignore
index 63af063..ceadc51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,5 @@
 *.key
 *.pem
 secrets/
+alertmanager/telegram_token
+**/telegram_token
diff --git a/stacks/monitoring/alertmanager/alertmanager.yml b/stacks/monitoring/alertmanager/alertmanager.yml
new file mode 100644
index 0000000..217fb33
--- /dev/null
+++ b/stacks/monitoring/alertmanager/alertmanager.yml
@@ -0,0 +1,28 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: telegram
+  group_by: ['alertname', 'instance']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 3h
+
+# HostDiskLow (< 15 % free) is still true once HostDiskCritical (< 5 % free)
+# fires, so mute the warning for that filesystem while the critical is active.
+inhibit_rules:
+  - source_matchers: ['alertname="HostDiskCritical"']
+    target_matchers: ['alertname="HostDiskLow"']
+    equal: ['instance', 'mountpoint']
+
+receivers:
+  - name: telegram
+    telegram_configs:
+      - bot_token_file: /etc/alertmanager/telegram_token
+        chat_id: 607015
+        parse_mode: HTML
+        send_resolved: true
+        message: |
+          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} {{ .CommonLabels.alertname }} [{{ .Status | toUpper }}]
+          {{ range .Alerts }}{{ .Annotations.summary }}
+          {{ end }}
diff --git a/stacks/monitoring/docker-compose.yml b/stacks/monitoring/docker-compose.yml
index 3f351e3..7ad6849 100644
--- a/stacks/monitoring/docker-compose.yml
+++ b/stacks/monitoring/docker-compose.yml
@@ -12,11 +12,25 @@ services:
       - '--web.enable-lifecycle'
     volumes:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./prometheus/targets:/etc/prometheus/targets:ro
+      - ./prometheus/rules:/etc/prometheus/rules:ro
       - prometheus_data:/prometheus
     ports:
       - '127.0.0.1:9090:9090'
     networks: [monitoring]
 
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: alertmanager
+    restart: unless-stopped
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - ./alertmanager/telegram_token:/etc/alertmanager/telegram_token:ro
+      - alertmanager_data:/alertmanager
+    ports:
+      - '127.0.0.1:9093:9093'
+    networks: [monitoring]
+
   grafana:
     image: grafana/grafana:latest
     container_name: grafana
@@ -69,6 +83,7 @@ services:
 volumes:
   prometheus_data:
   grafana_data:
+  alertmanager_data:
 
 networks:
   monitoring:
diff --git a/stacks/monitoring/prometheus/prometheus.yml b/stacks/monitoring/prometheus/prometheus.yml
index 7a54a3f..1b5cc7c 100644
--- a/stacks/monitoring/prometheus/prometheus.yml
+++ b/stacks/monitoring/prometheus/prometheus.yml
@@ -4,17 +4,23 @@ global:
   external_labels:
     monitor: ruzzy-infra
 
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']
+
+rule_files:
+  - /etc/prometheus/rules/*.yml
+
 scrape_configs:
   - job_name: prometheus
     static_configs:
       - targets: ['localhost:9090']
 
-  # Хосты: новый сервер = добавить файл в targets/node/.yml + reload
   - job_name: node
     file_sd_configs:
       - files: ['/etc/prometheus/targets/node/*.yml']
 
-  # Контейнеры
   - job_name: cadvisor
     file_sd_configs:
       - files: ['/etc/prometheus/targets/cadvisor/*.yml']
diff --git a/stacks/monitoring/prometheus/rules/alerts.yml b/stacks/monitoring/prometheus/rules/alerts.yml
new file mode 100644
index 0000000..2cd419e
--- /dev/null
+++ b/stacks/monitoring/prometheus/rules/alerts.yml
@@ -0,0 +1,37 @@
+groups:
+  - name: host
+    rules:
+      - alert: TargetDown
+        expr: up == 0
+        for: 2m
+        labels: {severity: critical}
+        annotations:
+          summary: "Таргет {{ $labels.job }} на {{ $labels.instance }} недоступен"
+
+      - alert: HostHighCPU
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
+        for: 10m
+        labels: {severity: warning}
+        annotations:
+          summary: "Высокий CPU на {{ $labels.instance }}: {{ printf \"%.0f\" $value }}%"
+
+      - alert: HostLowMemory
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        for: 5m
+        labels: {severity: warning}
+        annotations:
+          summary: "Мало памяти на {{ $labels.instance }}: доступно {{ printf \"%.0f\" $value }}%"
+
+      - alert: HostDiskLow
+        expr: node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"} * 100 < 15
+        for: 5m
+        labels: {severity: warning}
+        annotations:
+          summary: "Кончается диск на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf \"%.0f\" $value }}%"
+
+      - alert: HostDiskCritical
+        expr: node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"} * 100 < 5
+        for: 5m
+        labels: {severity: critical}
+        annotations:
+          summary: "КРИТИЧНО: диск почти полон на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf \"%.0f\" $value }}%"