monitoring: алерты через Alertmanager → Telegram (node down/cpu/mem/disk)
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,3 +2,5 @@
|
|||||||
*.key
|
*.key
|
||||||
*.pem
|
*.pem
|
||||||
secrets/
|
secrets/
|
||||||
|
# Telegram bot token used by Alertmanager: never commit it.
# A pattern with a slash in the middle is anchored to the directory of this
# .gitignore, so "alertmanager/telegram_token" would not match the real file at
# stacks/monitoring/alertmanager/telegram_token. The "**/" form matches the
# file at any depth, so it is the only pattern needed.
**/telegram_token
|
||||||
|
|||||||
21
stacks/monitoring/alertmanager/alertmanager.yml
Normal file
21
stacks/monitoring/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# Alertmanager configuration: every alert is delivered to a single Telegram chat.
global:
  resolve_timeout: 5m

route:
  # Default (and only) receiver for all alerts.
  receiver: telegram
  # One notification per alert name and host.
  group_by:
    - alertname
    - instance
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h

inhibit_rules:
  # HostDiskLow (<15% free) and HostDiskCritical (<5% free) are evaluated on the
  # same series, so a nearly full disk fires both. While the critical alert is
  # firing for a filesystem, mute the warning for that same filesystem.
  - source_matchers:
      - 'alertname="HostDiskCritical"'
    target_matchers:
      - 'alertname="HostDiskLow"'
    equal:
      - instance
      - mountpoint

receivers:
  - name: telegram
    telegram_configs:
      # The token is read from a file so it never appears in this config.
      - bot_token_file: /etc/alertmanager/telegram_token
        chat_id: 607015
        # The message below uses <b> tags, so Telegram must parse it as HTML.
        parse_mode: HTML
        send_resolved: true
        # First line: status icon, alert name, upper-cased status.
        # Then one line per alert in the group with its "summary" annotation.
        message: |
          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} <b>{{ .CommonLabels.alertname }}</b> [{{ .Status | toUpper }}]
          {{ range .Alerts }}{{ .Annotations.summary }}
          {{ end }}
|
||||||
@@ -12,11 +12,25 @@ services:
|
|||||||
- '--web.enable-lifecycle'
|
- '--web.enable-lifecycle'
|
||||||
volumes:
|
volumes:
|
||||||
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||||
|
- ./prometheus/targets:/etc/prometheus/targets:ro
|
||||||
|
- ./prometheus/rules:/etc/prometheus/rules:ro
|
||||||
- prometheus_data:/prometheus
|
- prometheus_data:/prometheus
|
||||||
ports:
|
ports:
|
||||||
- '127.0.0.1:9090:9090'
|
- '127.0.0.1:9090:9090'
|
||||||
networks: [monitoring]
|
networks: [monitoring]
|
||||||
|
|
||||||
|
# Alertmanager: receives alerts from Prometheus (alertmanager:9093) and sends
# the notifications configured in alertmanager.yml.
alertmanager:
  container_name: alertmanager
  image: prom/alertmanager:latest
  restart: unless-stopped
  networks:
    - monitoring
  ports:
    # Published on the host loopback interface only.
    - '127.0.0.1:9093:9093'
  volumes:
    - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    # File referenced by bot_token_file in alertmanager.yml; it is git-ignored,
    # so it has to be created on the host before the container starts.
    - ./alertmanager/telegram_token:/etc/alertmanager/telegram_token:ro
    - alertmanager_data:/alertmanager
|
||||||
|
|
||||||
grafana:
|
grafana:
|
||||||
image: grafana/grafana:latest
|
image: grafana/grafana:latest
|
||||||
container_name: grafana
|
container_name: grafana
|
||||||
@@ -69,6 +83,7 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
prometheus_data:
|
prometheus_data:
|
||||||
grafana_data:
|
grafana_data:
|
||||||
|
alertmanager_data:
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
monitoring:
|
monitoring:
|
||||||
|
|||||||
@@ -4,17 +4,23 @@ global:
|
|||||||
external_labels:
|
external_labels:
|
||||||
monitor: ruzzy-infra
|
monitor: ruzzy-infra
|
||||||
|
|
||||||
|
# Send firing alerts to the Alertmanager instance at alertmanager:9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# Load every *.yml rule file found under /etc/prometheus/rules.
rule_files:
  - '/etc/prometheus/rules/*.yml'
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: prometheus
|
- job_name: prometheus
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ['localhost:9090']
|
- targets: ['localhost:9090']
|
||||||
|
|
||||||
# Хосты: новый сервер = добавить файл в targets/node/<name>.yml + reload
|
|
||||||
- job_name: node
|
- job_name: node
|
||||||
file_sd_configs:
|
file_sd_configs:
|
||||||
- files: ['/etc/prometheus/targets/node/*.yml']
|
- files: ['/etc/prometheus/targets/node/*.yml']
|
||||||
|
|
||||||
# Контейнеры
|
|
||||||
- job_name: cadvisor
|
- job_name: cadvisor
|
||||||
file_sd_configs:
|
file_sd_configs:
|
||||||
- files: ['/etc/prometheus/targets/cadvisor/*.yml']
|
- files: ['/etc/prometheus/targets/cadvisor/*.yml']
|
||||||
|
|||||||
37
stacks/monitoring/prometheus/rules/alerts.yml
Normal file
37
stacks/monitoring/prometheus/rules/alerts.yml
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# Host-level alerting rules: target availability, CPU, memory and disk space.
groups:
  - name: host
    rules:
      # Any scrape target whose "up" metric has been 0 for 2 minutes.
      - alert: TargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Таргет {{ $labels.job }} на {{ $labels.instance }} недоступен'

      # CPU usage (100 minus the average idle percentage over 5m) above 90%
      # per instance, sustained for 10 minutes.
      - alert: HostHighCPU
        expr: >-
          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Высокий CPU на {{ $labels.instance }}: {{ printf "%.0f" $value }}%'

      # Less than 10% of total memory available for 5 minutes.
      - alert: HostLowMemory
        expr: >-
          node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Мало памяти на {{ $labels.instance }}: доступно {{ printf "%.0f" $value }}%'

      # Less than 15% free space on a filesystem (tmpfs, overlay and squashfs
      # are excluded) for 5 minutes.
      - alert: HostDiskLow
        expr: >-
          node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}
          * 100 < 15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Кончается диск на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf "%.0f" $value }}%'

      # Same measurement as HostDiskLow with a 5% threshold and critical severity.
      - alert: HostDiskCritical
        expr: >-
          node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}
          * 100 < 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'КРИТИЧНО: диск почти полон на {{ $labels.instance }} ({{ $labels.mountpoint }}): свободно {{ printf "%.0f" $value }}%'
|
||||||
Reference in New Issue
Block a user