Alertmanager 告警管理
告警流程
Prometheus 评估告警规则
    │ 触发告警
    ▼
Alertmanager
├── 分组(Group):合并相似告警
├── 抑制(Inhibit):高优先级告警抑制低优先级
├── 静默(Silence):维护窗口期间静默
└── 路由(Route):发送到不同接收器
    ├── Email
    ├── Slack
    ├── PagerDuty
    └── Webhook

Alertmanager 配置
yaml
# alertmanager.yaml
# Global defaults; the SMTP settings here are used by every email_configs
# entry that does not override them.
global:
  resolve_timeout: 5m
  # SMTP relay as host:port; quoted so the colon is never re-interpreted.
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: alertmanager@example.com
  smtp_auth_username: alertmanager
  # NOTE(review): placeholder credential in plain text. Do not commit a real
  # password here; inject it at deploy time or use smtp_auth_password_file
  # where the deployed Alertmanager version supports it.
  smtp_auth_password: password
# Root of the routing tree. Child routes are evaluated top to bottom and an
# alert stops at the first match unless that route sets `continue: true`.
# Alerts matching no child route are delivered to the `default` receiver.
route:
  group_by: [alertname, cluster, namespace]
  group_wait: 30s       # wait for other alerts of the same group
  group_interval: 5m    # interval between notifications for the same group
  repeat_interval: 4h   # interval before a notification is re-sent
  receiver: default
  routes:
    # Database alerts go to the DBA team. This route is listed first with
    # `continue: true` because database alerts normally also carry a
    # severity label; placed after the severity routes it would never match.
    # With `continue`, the alert is still evaluated against the routes below.
    - match_re:
        alertname: '^(MySQL|PostgreSQL|Redis).*'
      receiver: dba-team
      continue: true
    # Critical alerts notify PagerDuty immediately.
    - match:
        severity: critical
      receiver: pagerduty
      group_wait: 0s
      repeat_interval: 1h
    # Warnings are sent to Slack.
    - match:
        severity: warning
      receiver: slack
      group_wait: 1m
# Notification targets referenced by name from the routing tree.
receivers:
  # Fallback receiver for alerts that match no child route.
  - name: default
    email_configs:
      - to: ops@example.com
  - name: pagerduty
    pagerduty_configs:
      # Placeholder; replace with the real integration key.
      - service_key: '<pagerduty-service-key>'
        description: '{{ template "pagerduty.default.description" . }}'
  - name: slack
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/xxx/yyy/zzz'
        channel: '#alerts'
        title: '{{ template "slack.default.title" . }}'
        text: '{{ template "slack.default.text" . }}'
        send_resolved: true
  # DBA team is notified by both email and Slack.
  - name: dba-team
    email_configs:
      - to: dba@example.com
    slack_configs:
      # api_url is required here because no global slack_api_url is defined;
      # without it Alertmanager rejects the configuration at load time.
      # Placeholder webhook; replace with the DBA channel's webhook URL.
      - api_url: 'https://hooks.slack.com/services/xxx/yyy/zzz'
        channel: '#dba-alerts'
# Inhibition: a firing source alert mutes matching target alerts that share
# the same values for every label listed under `equal`.
inhibit_rules:
  # While ClusterDown is firing, suppress Node* alerts from the same cluster.
  - source_match:
      alertname: ClusterDown
    target_match_re:
      alertname: '^Node.*'
    equal: [cluster]

告警规则最佳实践
yaml
# Prometheus alerting rule file.
groups:
  - name: slo-alerts
    rules:
      # SLO alert: share of 5xx responses per service exceeds 1%.
      - alert: ErrorRateTooHigh
        # Ratio of the 5xx request rate to the total request rate, both over
        # a 5-minute window and aggregated per `service` label.
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
            /
            sum(rate(http_requests_total[5m])) by (service)
          ) > 0.01
        # The condition must hold for 5 minutes before the alert fires.
        for: 5m
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "{{ $labels.service }} 错误率过高"
          description: "服务 {{ $labels.service }} 的错误率为 {{ $value | humanizePercentage }},超过 SLO 阈值 1%"
          runbook: "https://wiki.example.com/runbooks/high-error-rate"
          dashboard: "https://grafana.example.com/d/xxx"

静默(Silence)
bash
# Alertmanager endpoint shared by every amtool call below, so that each
# command works without a separate amtool config file.
AM_URL=http://alertmanager:9093

# Create a silence (maintenance window): mutes every alert in the
# production namespace for two hours.
amtool silence add \
  --alertmanager.url="$AM_URL" \
  --author="ops-team" \
  --comment="计划维护窗口" \
  --duration=2h \
  alertname=~".*" namespace="production"

# List silences
amtool silence query --alertmanager.url="$AM_URL"

# Expire (delete) a silence. Replace the placeholder with a real silence ID;
# it is quoted so the shell does not treat < and > as redirections.
amtool silence expire --alertmanager.url="$AM_URL" "<silence-id>"