SRE 每日主题:Prometheus + Grafana 监控体系
日期: 2026-03-10
主题序号: 10 (10 % 12 = 10)
难度: 中高级
适用场景: 生产环境监控体系建设
目录
架构概述
┌─────────────────────────────────────────────────────────────────┐
│ 监控体系架构 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Exporters │───▶│ Prometheus │───▶│ Grafana │ │
│ │ (节点/应用) │ │ (存储/查询) │ │ (可视化) │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Node Exporter│ │ Alertmanager │ │ Alerting │ │
│ │ Kube State │ │ (告警路由) │ │ (钉钉/企微) │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
Prometheus 部署与配置
1.1 Kubernetes 部署 (推荐)
# prometheus-deployment.yaml
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
labels:
name: monitoring
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_timeout: 10s
external_labels:
monitor: 'production'
environment: 'prod'
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
rule_files:
- /etc/prometheus/rules/*.yml
scrape_configs:
# Prometheus 自监控
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# Kubernetes API Server
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
# Kubernetes Nodes
- job_name: 'kubernetes-nodes'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
# Kubernetes Pods
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: "true"
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: kubernetes_pod_name
# Node Exporter
- job_name: 'node-exporter'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_endpoints_name]
regex: 'node-exporter'
action: keep
# Kube State Metrics
- job_name: 'kube-state-metrics'
static_configs:
- targets: ['kube-state-metrics:8080']
# 业务应用监控 (示例)
- job_name: 'app-metrics'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app]
action: keep
regex: .+
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: "true"
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: monitoring
labels:
app: prometheus
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9090"
spec:
serviceAccountName: prometheus
containers:
- name: prometheus
image: prom/prometheus:v2.52.0
ports:
- containerPort: 9090
name: http
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention.time=15d"
- "--storage.tsdb.retention.size=10GB"
- "--web.enable-lifecycle"
- "--web.enable-admin-api"
- "--query.max-concurrency=40"
- "--query.timeout=2m"
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "4Gi"
cpu: "2000m"
volumeMounts:
- name: config-volume
mountPath: /etc/prometheus
- name: rules-volume
mountPath: /etc/prometheus/rules
- name: data-volume
mountPath: /prometheus
livenessProbe:
httpGet:
path: /-/healthy
port: 9090
initialDelaySeconds: 30
periodSeconds: 15
readinessProbe:
httpGet:
path: /-/ready
port: 9090
initialDelaySeconds: 30
periodSeconds: 15
volumes:
- name: config-volume
configMap:
name: prometheus-config
- name: rules-volume
configMap:
name: prometheus-rules
- name: data-volume
persistentVolumeClaim:
claimName: prometheus-pvc
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-pvc
namespace: monitoring
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi
storageClassName: local-path
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: monitoring
labels:
app: prometheus
spec:
type: ClusterIP
ports:
- port: 9090
targetPort: 9090
name: http
selector:
app: prometheus
1.2 Docker Compose 部署 (单机/测试)
# docker-compose.yml
version: '3.8'
services:
prometheus:
image: prom/prometheus:v2.52.0
container_name: prometheus
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./rules:/etc/prometheus/rules:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
- '--storage.tsdb.retention.size=10GB'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
restart: unless-stopped
networks:
- monitoring
grafana:
image: grafana/grafana:10.4.0
container_name: grafana
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=SecurePassword123!
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-piechart-panel
- GF_SERVER_ROOT_URL=http://grafana:3000
- GF_AUTH_ANONYMOUS_ENABLED=false
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
- ./grafana/dashboards:/var/lib/grafana/dashboards
depends_on:
- prometheus
restart: unless-stopped
networks:
- monitoring
node-exporter:
image: prom/node-exporter:v1.7.0
container_name: node-exporter
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--path.rootfs=/rootfs'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
restart: unless-stopped
networks:
- monitoring
volumes:
prometheus_data:
grafana_data:
networks:
monitoring:
driver: bridge
1.3 告警规则配置
# prometheus-rules-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-rules
namespace: monitoring
data:
# 节点告警
node-alerts.yml: |
groups:
- name: node-alerts
rules:
- alert: NodeDown
expr: up{job="node-exporter"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "节点 {{ $labels.instance }} 宕机"
description: "节点 {{ $labels.instance }} 已经宕机超过 5 分钟"
- alert: NodeHighCPU
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 10m
labels:
severity: warning
annotations:
summary: "节点 {{ $labels.instance }} CPU 使用率过高"
description: "CPU 使用率: {{ $value }}%"
- alert: NodeHighMemory
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
for: 10m
labels:
severity: warning
annotations:
summary: "节点 {{ $labels.instance }} 内存使用率过高"
description: "内存使用率: {{ $value }}%"
- alert: NodeDiskFull
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
for: 30m
labels:
severity: warning
annotations:
summary: "节点 {{ $labels.instance }} 磁盘空间不足"
description: "磁盘使用率: {{ $value }}%"
- alert: NodeDiskWillFillIn24h
expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}[6h], 24*3600) < 0
for: 30m
labels:
severity: critical
annotations:
summary: "节点 {{ $labels.instance }} 磁盘将在 24 小时内写满"
description: "当前可用空间预测将在 24 小时内耗尽"
# Kubernetes 告警
kubernetes-alerts.yml: |
groups:
- name: kubernetes-alerts
rules:
- alert: PodCrashLooping
expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 0
for: 15m
labels:
severity: warning
annotations:
summary: "Pod {{ $labels.pod }} 重启频繁"
description: "Pod {{ $labels.pod }} 最近 15 分钟内平均每 5 分钟重启 {{ $value }} 次"
- alert: PodNotReady
expr: kube_pod_status_ready{condition="true"} == 0
for: 10m
labels:
severity: warning
annotations:
summary: "Pod {{ $labels.pod }} 未就绪"
description: "Pod {{ $labels.pod }} 已经 10 分钟未就绪"
- alert: DeploymentReplicasMismatch
expr: kube_deployment_status_replicas_available != kube_deployment_spec_replicas
for: 15m
labels:
severity: warning
annotations:
summary: "Deployment {{ $labels.deployment }} 副本数不匹配"
description: "Deployment {{ $labels.deployment }} 可用副本数 ({{ $value }}) 与期望副本数不一致"
- alert: NodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: "节点 {{ $labels.node }} 未就绪"
description: "节点 {{ $labels.node }} 已经 10 分钟未就绪"
# 应用告警
application-alerts.yml: |
groups:
- name: application-alerts
rules:
- alert: HighErrorRate
expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "应用错误率过高"
description: "5xx 错误率: {{ $value | humanizePercentage }}"
- alert: HighLatency
expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 1
for: 10m
labels:
severity: warning
annotations:
summary: "应用延迟过高"
description: "P99 延迟: {{ $value }}s"
- alert: ServiceDown
expr: up{job=~".+-app"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "服务 {{ $labels.job }} 宕机"
description: "服务 {{ $labels.job }} 已经宕机超过 2 分钟"
Grafana 部署与配置
2.1 数据源配置
# grafana/provisioning/datasources/prometheus.yml
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true
jsonData:
timeInterval: "15s"
queryTimeout: "2m"
httpMethod: "POST"
2.2 告警通知配置
# grafana/provisioning/contact-points/alertmanager.yml
apiVersion: 1
contactPoints:
- orgId: 1
name: dingtalk
receivers:
- uid: dingtalk-webhook
type: dingding
settings:
url: https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN
message: |
## {{ .GroupLabels.alertname }}
**告警级别:** {{ .CommonLabels.severity }}
**告警时间:** {{ .StartsAt.Format "2006-01-02 15:04:05" }}
**描述:** {{ .CommonAnnotations.description }}
2.3 仪表板 JSON 示例 (Node Exporter)
{
"dashboard": {
"id": null,
"uid": "node-exporter-full",
"title": "Node Exporter Full",
"tags": ["linux"],
"timezone": "browser",
"schemaVersion": 38,
"version": 1,
"refresh": "30s",
"panels": [
{
"id": 1,
"type": "gauge",
"title": "CPU Usage",
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 0},
"targets": [
{
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 60},
{"color": "red", "value": 80}
]
}
}
}
},
{
"id": 2,
"type": "gauge",
"title": "Memory Usage",
"gridPos": {"h": 8, "w": 6, "x": 6, "y": 0},
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 85}
]
}
}
}
},
{
"id": 3,
"type": "gauge",
"title": "Disk Usage",
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 0},
"targets": [
{
"expr": "(1 - (node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay\"})) * 100",
"legendFormat": "{{instance}} - {{mountpoint}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 85}
]
}
}
}
},
{
"id": 4,
"type": "timeseries",
"title": "Network Traffic",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"expr": "rate(node_network_receive_bytes_total[5m])",
"legendFormat": "RX - {{device}}"
},
{
"expr": "-rate(node_network_transmit_bytes_total[5m])",
"legendFormat": "TX - {{device}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
}
}
}
]
}
}
关键参数调优
3.1 Prometheus 核心参数
| 参数 | 推荐值 | 说明 |
|---|---|---|
scrape_interval |
15s | 采集间隔,生产环境建议 15-30s |
scrape_timeout |
10s | 采集超时,应小于 scrape_interval |
evaluation_interval |
15s | 规则评估间隔 |
storage.tsdb.retention.time |
15d | 数据保留时间,根据存储调整 |
storage.tsdb.retention.size |
10GB | 数据保留大小上限 |
storage.tsdb.wal-compression |
true | 启用 WAL 压缩,可减少约 50% 的 WAL 磁盘占用 |
storage.tsdb.head-chunks-write-queue-size |
10000 | 头部块写入队列大小 |
query.max-concurrency |
40 | 最大并发查询数 |
query.timeout |
2m | 查询超时时间 |
web.max-connections |
512 | 最大 HTTP 连接数 |
3.2 内存调优
# 根据数据规模调整内存限制
# 小型集群 (<100 节点)
resources:
requests:
memory: "512Mi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
# 中型集群 (100-500 节点)
resources:
requests:
memory: "2Gi"
cpu: "1000m"
limits:
memory: "8Gi"
cpu: "4000m"
# 大型集群 (>500 节点)
resources:
requests:
memory: "8Gi"
cpu: "4000m"
limits:
memory: "16Gi"
cpu: "8000m"
3.3 存储调优
# TSDB 块配置优化
--storage.tsdb.min-block-duration=2h # 最小块时长
--storage.tsdb.max-block-duration=2h # 最大块时长
--storage.tsdb.no-lockfile # 禁用锁文件 (容器环境)
# 磁盘性能优化
# 使用 SSD 存储
# 挂载选项:noatime,nodiratime
# 文件系统:ext4 或 xfs
3.4 抓取配置优化
scrape_configs:
# 高频监控 (核心服务)
- job_name: 'critical-services'
scrape_interval: 5s
scrape_timeout: 3s
static_configs: [...]
# 常规监控
- job_name: 'normal-services'
scrape_interval: 15s
scrape_timeout: 10s
static_configs: [...]
# 低频监控 (不关键指标)
- job_name: 'low-priority'
scrape_interval: 60s
scrape_timeout: 30s
static_configs: [...]
监控命令与故障排查
4.1 常用 PromQL 查询
# 系统概览
# CPU 使用率
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# 内存使用率
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# 磁盘使用率
(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100
# 网络流量
rate(node_network_receive_bytes_total[5m])
rate(node_network_transmit_bytes_total[5m])
# Kubernetes 相关
# Pod 重启次数
rate(kube_pod_container_status_restarts_total[15m])
# Deployment 副本状态
kube_deployment_status_replicas_available / kube_deployment_spec_replicas
# 节点状态
kube_node_status_condition{condition="Ready"}
# 应用指标
# 请求速率
sum(rate(http_requests_total[5m])) by (service)
# 错误率
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))
# P99 延迟
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
# 磁盘空间耗尽预测 (未来 24 小时可用空间)
predict_linear(node_filesystem_avail_bytes[6h], 24*3600)
4.2 故障排查命令
# 检查 Prometheus 状态
curl -s http://localhost:9090/-/healthy
curl -s http://localhost:9090/-/ready
# 查看 targets 状态
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | select(.health != "up")'
# 查看规则状态
curl -s http://localhost:9090/api/v1/rules | jq '.data.groups[].rules[] | select(.health != "ok")'
# 查询最近错误
curl -s 'http://localhost:9090/api/v1/query?query=sum(scrape_samples_scraped{job="node-exporter"})' | jq
# 检查 TSDB 状态
curl -s http://localhost:9090/api/v1/status/tsdb | jq
# 查看配置
curl -s http://localhost:9090/api/v1/status/config | jq '.data.config'
# 热重载配置
curl -X POST http://localhost:9090/-/reload
# 删除时间序列数据 (谨慎使用)
curl -X POST -g 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]={job="test"}'
# 查看 WAL 文件
ls -lh /prometheus/wal/
# 检查块文件
ls -lh /prometheus/blocks/
# 内存使用分析
kubectl top pod -n monitoring prometheus-xxx
# 查看 Prometheus 日志
kubectl logs -n monitoring prometheus-xxx --tail=100
# Grafana 告警查询
# 查看告警历史
SELECT * FROM alert_rule WHERE state = 'firing'
# 查看通知记录
SELECT * FROM notification_log ORDER BY created DESC LIMIT 50
4.3 常见问题排查流程
1. 指标采集失败
├── 检查 target 状态: /api/v1/targets
├── 检查网络连通性: curl http://target:port/metrics
├── 检查防火墙规则
└── 检查服务发现配置
2. 查询缓慢/超时
├── 检查查询复杂度 (避免高基数 label)
├── 检查内存使用
├── 增加 query.timeout
└── 考虑使用 recording rules 预计算
3. 磁盘空间不足
├── 检查 retention 配置
├── 清理旧数据: delete_series API
├── 扩容 PVC
└── 启用压缩: --storage.tsdb.wal-compression
4. 内存溢出 (OOM)
├── 增大 scrape_interval (降低采集频率)
├── 减少保留时间
├── 增加内存限制
└── 检查高基数指标
5. 告警不触发
├── 检查规则语法
├── 检查规则状态: /api/v1/rules
├── 检查 Alertmanager 连接
└── 检查通知配置
最佳实践
5.1 架构设计
联邦架构 (大规模部署)
- 使用 Prometheus Federation 分层采集
- 边缘 Prometheus 采集原始数据
- 中心 Prometheus 聚合关键指标
高可用部署
- 至少部署 2 个 Prometheus 实例
- 使用 Thanos/Cortex 实现长期存储
- 配置 Alertmanager 集群
存储规划
- 生产环境至少保留 15 天数据
- 使用 SSD 存储提升性能
- 定期备份关键配置
5.2 指标规范
# 指标命名规范
# 格式:<namespace>_<subsystem>_<name>_<unit>
# 示例:
# http_requests_total # 计数器
# http_request_duration_seconds # 直方图
# node_memory_usage_bytes # 规范值
# app_connections_current # 当前值
# Label 规范
# 避免高基数 label (如 user_id, pod_id)
# 使用有限的枚举值
# 统一 label 命名 (instance, job, namespace, pod)
5.3 告警设计
# 告警分级
# P0 - Critical: 立即响应 (电话/短信)
# - 服务完全不可用
# - 数据丢失风险
# - 安全事件
# P1 - Warning: 工作时间响应 (钉钉/企微)
# - 性能下降
# - 资源即将耗尽
# - 非核心服务异常
# P2 - Info: 记录即可
# - 配置变更
# - 定期报告
# 告警规则设计原则
# 1. 避免告警风暴 (使用 group_wait, group_interval)
# 2. 设置合理的 for 持续时间
# 3. 包含清晰的 annotations
# 4. 定期审查和清理无效告警
5.4 安全加固
# 网络隔离
# - Prometheus 不暴露公网
# - 使用 ServiceMesh 控制访问
# - 配置 RBAC 限制 API 访问
# 认证授权
# - 启用基本认证或 OAuth
# - 配置 TLS 加密传输
# - 使用反向代理 (Nginx/Ingress)
# 数据安全
# - 定期备份配置
# - 敏感信息使用 Secret
# - 审计日志记录
常见问题
Q1: Prometheus 内存占用过高怎么办?
原因:
- 时间序列过多 (高基数指标)
- 保留时间过长
- 查询并发过高
解决方案:
# 1. 识别高基数指标
topk(10, count by (__name__) ({__name__=~".+"}))
# 2. 使用 metric_relabel_configs 丢弃不需要的指标
metric_relabel_configs:
- source_labels: [__name__]
regex: 'go_.*'
action: drop
# 3. 减少保留时间
--storage.tsdb.retention.time=7d
# 4. 增加内存限制
resources:
limits:
memory: "8Gi"
Q2: 如何迁移 Prometheus 数据?
# 方法 1: 直接复制数据目录
kubectl cp monitoring/prometheus-xxx:/prometheus ./backup
# 方法 2: 使用 Thanos 实现长期存储
# 配置对象存储 (S3/GCS)
# 启用 sidecar 上传块文件
# 方法 3: 远程写入 (Remote Write)
remote_write:
- url: "http://remote-storage:9201/write"
queue_config:
max_samples_per_send: 10000
batch_send_deadline: 5s
Q3: Grafana 仪表板加载缓慢?
优化方案:
{
"dashboard": {
"refresh": "30s", // 降低刷新频率
"time": {
"from": "now-1h", // 缩小时间范围
"to": "now"
},
"templating": {
"list": [] // 减少模板变量
}
}
}
# 使用 recording rules 预计算
# 原始查询
sum(rate(http_requests_total[5m])) by (service)
# Recording rule
- record: job:http_requests:rate5m
expr: sum(rate(http_requests_total[5m])) by (job)
# 仪表板使用预计算指标
sum(job:http_requests:rate5m) by (job)
Q4: 如何实现多集群监控?
# 方案 1: Prometheus 联邦
scrape_configs:
- job_name: 'federate'
honor_labels: true
metrics_path: '/federate'
params:
'match[]':
- '{job="kubernetes-nodes"}'
- '{__name__=~"job:.*"}'
static_configs:
- targets:
- 'prometheus-cluster1:9090'
- 'prometheus-cluster2:9090'
# 方案 2: Thanos Query
# 部署 Thanos Query 组件
# 配置 sidecar 连接各集群 Prometheus
# 通过 Thanos Query 统一查询
# 方案 3: VictoriaMetrics
# 使用 vmagent 收集多集群数据
# 统一存储到 VictoriaMetrics
Q5: 告警通知太多怎么办?
# Alertmanager 配置优化
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.example.com:587'
route:
receiver: 'default'
group_by: ['alertname', 'cluster']
group_wait: 30s # 等待 30s 收集同组告警
group_interval: 5m # 同组告警间隔 5m
repeat_interval: 4h # 重复告警间隔 4h
routes:
- match:
severity: critical
receiver: 'critical-alerts'
group_wait: 10s
repeat_interval: 1h
- match:
severity: warning
receiver: 'warning-alerts'
group_wait: 1m
repeat_interval: 4h
receivers:
- name: 'critical-alerts'
webhook_configs:
- url: 'http://dingtalk-webhook'
send_resolved: true
- name: 'warning-alerts'
webhook_configs:
- url: 'http://dingtalk-webhook'
send_resolved: true
- name: 'default'
webhook_configs:
- url: 'http://dingtalk-webhook'
send_resolved: true
# 告警抑制
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'cluster']
附录:快速部署脚本
#!/bin/bash
# deploy-monitoring.sh - 快速部署 Prometheus + Grafana
set -e
NAMESPACE="monitoring"
RELEASE_NAME="monitoring"
# 添加 Helm 仓库
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
# 创建命名空间
kubectl create namespace $NAMESPACE --dry-run=client -o yaml | kubectl apply -f -
# 部署 Prometheus (使用 kube-prometheus-stack)
helm upgrade --install $RELEASE_NAME prometheus-community/kube-prometheus-stack \
--namespace $NAMESPACE \
--set prometheus.prometheusSpec.retention=15d \
--set prometheus.prometheusSpec.resources.requests.memory=512Mi \
--set prometheus.prometheusSpec.resources.limits.memory=4Gi \
--set grafana.adminPassword=SecurePassword123! \
--set alertmanager.enabled=true \
--values values.yaml
# 等待部署完成
kubectl rollout status statefulset/prometheus-$RELEASE_NAME-kube-prometheus-sta-prometheus -n $NAMESPACE
kubectl rollout status deployment/$RELEASE_NAME-grafana -n $NAMESPACE
# 获取访问地址
echo "=== 访问地址 ==="
echo "Prometheus: http://localhost:9090"
echo "Grafana: http://localhost:3000"
echo "Alertmanager: http://localhost:9093"
# 端口转发
kubectl port-forward svc/$RELEASE_NAME-kube-prometheus-sta-prometheus 9090:9090 -n $NAMESPACE &
kubectl port-forward svc/$RELEASE_NAME-grafana 3000:80 -n $NAMESPACE &
kubectl port-forward svc/$RELEASE_NAME-kube-prometheus-sta-alertmanager 9093:9093 -n $NAMESPACE &
echo "部署完成!使用 Ctrl+C 停止端口转发"
文档版本: 1.0
最后更新: 2026-03-10
维护者: SRE Team