Prometheus & Grafana 监控体系
架构概览
应用 Pod(暴露 /metrics)
│ Scrape(拉取)
▼
Prometheus Server(存储时序数据)
│ PromQL 查询
├── Grafana(可视化)
└── Alertmanager(告警)
│
├── Email / Slack / PagerDuty
└── ...
Prometheus Operator
bash
# 使用 kube-prometheus-stack 一键安装
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
--namespace monitoring \
--create-namespace \
--set grafana.adminPassword=admin123 \
--set prometheus.prometheusSpec.retention=30d \
--set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50GiServiceMonitor(自动发现监控目标)
yaml
# ServiceMonitor that tells Prometheus to scrape Services labelled app=my-app
# in the production namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: my-app-monitor
  namespace: monitoring
  labels:
    release: kube-prometheus-stack  # must match the Prometheus serviceMonitorSelector
spec:
  # Namespaces in which to look for matching Services.
  namespaceSelector:
    matchNames:
      - production
  # Label selector applied to Services in those namespaces.
  selector:
    matchLabels:
      app: my-app
  endpoints:
    - port: metrics        # name of the Service port to scrape
      path: /metrics
      interval: 30s
      scrapeTimeout: 10s
      # HTTP basic auth; both values are read from keys of the Secret
      # metrics-secret.
      # NOTE(review): the operator resolves this Secret in the ServiceMonitor's
      # own namespace (monitoring here) - confirm the Secret exists there.
      basicAuth:
        username:
          name: metrics-secret
          key: username
        password:
          name: metrics-secret
          key: password

应用暴露指标(Go)
go
import (
	"net/http"
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// httpRequestsTotal counts HTTP requests, partitioned by the "method",
// "path" and "status" labels.
var httpRequestsTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: "http_requests_total",
		Help: "HTTP 请求总数",
	},
	[]string{"method", "path", "status"},
)

// httpRequestDuration is a histogram of request latency in seconds,
// partitioned by "method" and "path", using the client library's default
// buckets.
var httpRequestDuration = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "http_request_duration_seconds",
		Help:    "HTTP 请求延迟",
		Buckets: prometheus.DefBuckets,
	},
	[]string{"method", "path"},
)

// activeConnections is a gauge intended to hold the number of currently
// active connections; nothing in this snippet updates it.
var activeConnections = promauto.NewGauge(
	prometheus.GaugeOpts{
		Name: "active_connections",
		Help: "当前活跃连接数",
	},
)
// metricsMiddleware returns a handler that wraps next and, after next has
// served the request, increments httpRequestsTotal and observes the elapsed
// time in seconds in httpRequestDuration.
//
// NOTE(review): the raw r.URL.Path is used as the "path" label value. If paths
// contain IDs or other unbounded segments this creates one time series per
// distinct URL; consider passing a route template instead.
func metricsMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()

		// Start from 200: net/http sends an implicit 200 OK when a handler
		// writes a body (or nothing) without calling WriteHeader. Without this
		// default such requests would be recorded with status "0".
		// responseWriter is defined elsewhere; presumably its WriteHeader
		// overwrites status with the explicit code - confirm.
		rw := &responseWriter{ResponseWriter: w, status: http.StatusOK}

		next.ServeHTTP(rw, r)

		duration := time.Since(start).Seconds()
		httpRequestsTotal.WithLabelValues(r.Method, r.URL.Path, strconv.Itoa(rw.status)).Inc()
		httpRequestDuration.WithLabelValues(r.Method, r.URL.Path).Observe(duration)
	})
}
// 暴露 /metrics 端点
http.Handle("/metrics", promhttp.Handler())
关键 PromQL 查询
text
# Pod CPU 使用率(%)
100 * sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, namespace)
/ sum(kube_pod_container_resource_limits{resource="cpu"}) by (pod, namespace)
# Pod memory utilization (%) relative to the Pod's memory limit.
# Both sides are summed by (pod, namespace): the cAdvisor series and the
# kube-state-metrics series carry different label sets, so dividing them
# directly matches nothing and returns an empty result.
100 * sum(container_memory_working_set_bytes{container!=""}) by (pod, namespace)
  / sum(kube_pod_container_resource_limits{resource="memory"}) by (pod, namespace)
# HTTP 请求 QPS
sum(rate(http_requests_total[5m])) by (service)
# HTTP P99 延迟
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))
# HTTP 错误率
sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
/ sum(rate(http_requests_total[5m])) by (service)
# 节点磁盘使用率
100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100)
# Deployment 副本不足告警
kube_deployment_status_replicas_available < kube_deployment_spec_replicas
PrometheusRule(告警规则)
yaml
# PrometheusRule defining the alerting rules for my-app.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: my-app-alerts
  namespace: monitoring
  labels:
    release: kube-prometheus-stack
spec:
  groups:
    - name: my-app
      interval: 30s  # evaluation interval for the rules in this group
      rules:
        # Fires when the share of 5xx responses per service stays above 5%
        # for 5 minutes.
        - alert: HighErrorRate
          expr: |
            sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
              / sum(rate(http_requests_total[5m])) by (service) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "服务 {{ $labels.service }} 错误率过高"
            description: "错误率 {{ $value | humanizePercentage }},超过 5%"
        # Fires when a container's restart counter has been increasing over
        # the last 15 minutes and the condition has held for 5 minutes.
        - alert: PodCrashLooping
          expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 频繁重启"

Grafana Dashboard
bash
# Access Grafana: forward local port 3000 to port 80 of the Grafana service
# in the monitoring namespace, then open http://localhost:3000.
kubectl port-forward -n monitoring svc/kube-prometheus-stack-grafana 3000:80
# Commonly used dashboard IDs (import them from grafana.com)
# 315 - Kubernetes cluster monitoring
# 6417 - Kubernetes Pods
# 1860 - Node Exporter Full
# 7249 - Kubernetes Cluster