kube-state-metrics
什么是 kube-state-metrics
kube-state-metrics 监听 K8s API,将集群对象状态转换为 Prometheus 指标。与 Metrics Server(资源使用量)不同,它关注的是对象的状态(副本数、条件、标签等)。
安装
bash
# 通常随 kube-prometheus-stack 一起安装
helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
--set kubeStateMetrics.enabled=true关键指标
text
# Deployment 副本状态
kube_deployment_status_replicas_available
kube_deployment_status_replicas_unavailable
kube_deployment_spec_replicas
# Pod 状态
kube_pod_status_phase{phase="Running"}
kube_pod_status_ready{condition="true"}
kube_pod_container_status_restarts_total
# Node 状态
kube_node_status_condition{condition="Ready",status="true"}
kube_node_spec_unschedulable
# PVC 状态
kube_persistentvolumeclaim_status_phase{phase="Bound"}
# Job 状态
kube_job_status_succeeded
kube_job_status_failed
kube_job_complete实用告警规则
yaml
# Prometheus alerting rules built on kube-state-metrics series.
# Structure: groups -> rules -> one mapping per alert. The nesting is
# significant: each alert's expr/for/labels/annotations must be indented
# under its "- alert:" entry.
groups:
  - name: kubernetes-apps
    rules:
      # Deployment: the desired replica count differs from the number of
      # available replicas, and has done so continuously for 15 minutes.
      - alert: KubeDeploymentReplicasMismatch
        expr: |
          kube_deployment_spec_replicas != kube_deployment_status_replicas_available
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本不足"

      # Pod: containers keep restarting. rate() yields restarts per second
      # over the 15m window; multiplying by 60 * 5 scales that to restarts
      # per 5 minutes, and any value above zero counts as a restart.
      - alert: KubePodCrashLooping
        expr: |
          rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 频繁重启"

      # PVC: the claim has been in the Pending phase (not bound) for 5 minutes.
      - alert: KubePersistentVolumeClaimPending
        expr: |
          kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} 未绑定"

      # Node: the Ready condition's status="true" series has been 0
      # (i.e. the node is not Ready) for 15 minutes.
      - alert: KubeNodeNotReady
        expr: |
          kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "节点 {{ $labels.node }} 不可用"