diff --git a/.gitignore b/.gitignore
index 63af063..ceadc51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,5 @@
 *.key
 *.pem
 secrets/
+# Telegram bot token files (bind-mounted into Alertmanager); ignore at any depth.
+**/telegram_token
diff --git a/stacks/monitoring/alertmanager/alertmanager.yml b/stacks/monitoring/alertmanager/alertmanager.yml
new file mode 100644
index 0000000..217fb33
--- /dev/null
+++ b/stacks/monitoring/alertmanager/alertmanager.yml
@@ -0,0 +1,33 @@
+# Alertmanager: routes every alert to a single Telegram receiver.
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: telegram
+  group_by: ['alertname', 'instance']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 3h
+
+# HostDiskCritical (<5%) and HostDiskLow (<15%) overlap, so a nearly full disk
+# fires both. Mute the warning while the critical alert for the same
+# instance/mountpoint is active.
+inhibit_rules:
+  - source_matchers:
+      - 'alertname="HostDiskCritical"'
+    target_matchers:
+      - 'alertname="HostDiskLow"'
+    equal: ['instance', 'mountpoint']
+
+receivers:
+  - name: telegram
+    telegram_configs:
+      # Token is read from a mounted file rather than inlined in this config.
+      - bot_token_file: /etc/alertmanager/telegram_token
+        chat_id: 607015
+        parse_mode: HTML
+        send_resolved: true
+        message: |
+          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} {{ .CommonLabels.alertname }} [{{ .Status | toUpper }}]
+          {{ range .Alerts }}{{ .Annotations.summary }}
+          {{ end }}
diff --git a/stacks/monitoring/docker-compose.yml b/stacks/monitoring/docker-compose.yml
index 3f351e3..7ad6849 100644
--- a/stacks/monitoring/docker-compose.yml
+++ b/stacks/monitoring/docker-compose.yml
@@ -12,11 +12,27 @@ services:
       - '--web.enable-lifecycle'
     volumes:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./prometheus/targets:/etc/prometheus/targets:ro
+      - ./prometheus/rules:/etc/prometheus/rules:ro
       - prometheus_data:/prometheus
     ports:
       - '127.0.0.1:9090:9090'
     networks: [monitoring]
 
+  alertmanager:
+    # Pinned instead of :latest so a pull/recreate cannot silently change the version.
+    image: prom/alertmanager:v0.27.0
+    container_name: alertmanager
+    restart: unless-stopped
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      # Must already exist as a regular file; a missing source is created as a directory.
+      - ./alertmanager/telegram_token:/etc/alertmanager/telegram_token:ro
+      - alertmanager_data:/alertmanager
+    ports:
+      - '127.0.0.1:9093:9093'
+    networks: [monitoring]
+
   grafana:
     image: grafana/grafana:latest
     container_name: grafana
@@ -69,6 +85,7 @@ services:
 volumes:
   prometheus_data:
   grafana_data:
+  alertmanager_data:
 
 networks:
   monitoring:
diff --git a/stacks/monitoring/prometheus/prometheus.yml b/stacks/monitoring/prometheus/prometheus.yml
index 7a54a3f..1b5cc7c 100644
--- a/stacks/monitoring/prometheus/prometheus.yml
+++ b/stacks/monitoring/prometheus/prometheus.yml
@@ -4,17 +4,27 @@ global:
   external_labels:
     monitor: ruzzy-infra
 
+# Firing alerts are sent to the alertmanager service on port 9093.
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']
+
+# Alerting rules are loaded from every *.yml file in this directory.
+rule_files:
+  - /etc/prometheus/rules/*.yml
+
 scrape_configs:
   - job_name: prometheus
     static_configs:
       - targets: ['localhost:9090']
 
-  # Хосты: новый сервер = добавить файл в targets/node/.yml + reload
+  # Hosts: to add a server, drop a *.yml file into targets/node/ and reload.
   - job_name: node
     file_sd_configs:
       - files: ['/etc/prometheus/targets/node/*.yml']
 
-  # Контейнеры
+  # Containers
   - job_name: cadvisor
     file_sd_configs:
       - files: ['/etc/prometheus/targets/cadvisor/*.yml']
diff --git a/stacks/monitoring/prometheus/rules/alerts.yml b/stacks/monitoring/prometheus/rules/alerts.yml
new file mode 100644
index 0000000..2cd419e
--- /dev/null
+++ b/stacks/monitoring/prometheus/rules/alerts.yml
@@ -0,0 +1,43 @@
+# Host-level alerting rules built on the `up` series and node_* metrics.
+groups:
+  - name: host
+    rules:
+      # Fires when any scrape target has reported up == 0 for 2 minutes.
+      - alert: TargetDown
+        expr: up == 0
+        for: 2m
+        labels: {severity: critical}
+        annotations:
+          summary: "Таргет {{ $labels.job }} на {{ $labels.instance }} недоступен"
+
+      # CPU usage = 100 minus the per-instance average idle percentage over 5m.
+      - alert: HostHighCPU
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
+        for: 10m
+        labels: {severity: warning}
+        annotations:
+          summary: "Высокий CPU на {{ $labels.instance }}: {{ printf \"%.0f\" $value }}%"
+
+      # Available memory is below 10% of total memory.
+      - alert: HostLowMemory
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        for: 5m
+        labels: {severity: warning}
+        annotations:
+          summary: "Мало памяти на {{ $labels.instance }}: доступно {{ printf \"%.0f\" $value }}%"
+
+      # Available space is below 15% on a filesystem that is not tmpfs/overlay/squashfs.
+      - alert: HostDiskLow
+        expr: node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"} * 100 < 15
+        for: 5m
+        labels: {severity: warning}
+        annotations:
+          summary: "Кончается диск на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf \"%.0f\" $value }}%"
+
+      # Same ratio as HostDiskLow with a 5% threshold and critical severity.
+      - alert: HostDiskCritical
+        expr: node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"} * 100 < 5
+        for: 5m
+        labels: {severity: critical}
+        annotations:
+          summary: "КРИТИЧНО: диск почти полон на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf \"%.0f\" $value }}%"