SRE 每日主题:Redis 集群运维与性能调优
日期: 2026-03-18
主题序号: 5
难度等级: ⭐⭐⭐⭐
适用场景: 生产环境 Redis 集群部署与运维
一、生产环境部署架构
1.1 Redis 集群架构(Cluster Mode)
┌─────────────────────────────────────────┐
│ 客户端连接层 │
│ (Redis Cluster-aware Client SDK) │
└──────────────────┬──────────────────────┘
│
┌──────────────────────┼──────────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
│ Master-01 │ │ Master-02 │ │ Master-03 │
│ Slot: 0-5460 │ │ Slot: 5461-10922│ │ Slot: 10923-16383│
│ Port: 6379 │ │ Port: 6379 │ │ Port: 6379 │
└───────┬───────┘ └───────┬───────┘ └───────┬───────┘
│ │ │
┌───────┴───────┐ ┌───────┴───────┐ ┌───────┴───────┐
│ Replica-01 │ │ Replica-02 │ │ Replica-03 │
│ Port: 6379 │ │ Port: 6379 │ │ Port: 6379 │
└───────────────┘ └───────────────┘ └───────────────┘
1.2 推荐部署架构(生产环境)
┌─────────────────────────────────────────────────────────────────────────┐
│ 应用层 (多实例) │
│ Redis Cluster-aware SDK │
└────────────────────────────────┬────────────────────────────────────────┘
│
┌────────────┴────────────┐
│ 负载均衡 / VIP │
│ (HAProxy / Envoy) │
└────────────┬────────────┘
│
┌────────────────────────┼────────────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
│ Shard-01 │ │ Shard-02 │ │ Shard-03 │
│ │ │ │ │ │
│ ┌───────────┐ │ │ ┌───────────┐ │ │ ┌───────────┐ │
│ │ Master │ │ │ │ Master │ │ │ │ Master │ │
│ │ 10.0.1.1 │ │ │ │ 10.0.2.1 │ │ │ │ 10.0.3.1 │ │
│ └─────┬─────┘ │ │ └─────┬─────┘ │ │ └─────┬─────┘ │
│ │ │ │ │ │ │ │ │
│ ┌─────┴─────┐ │ │ ┌─────┴─────┐ │ │ ┌─────┴─────┐ │
│ │ Replica │ │ │ │ Replica │ │ │ │ Replica │ │
│ │ 10.0.1.2 │ │ │ │ 10.0.2.2 │ │ │ │ 10.0.3.2 │ │
│ └───────────┘ │ │ └───────────┘ │ │ └───────────┘ │
│ │ │ │ │ │
│ Sentinel │ │ Sentinel │ │ Sentinel │
│ (仲裁节点) │ │ (仲裁节点) │ │ (仲裁节点) │
└───────────────┘ └───────────────┘ └───────────────┘
│ │ │
└────────────────────────┴────────────────────────┘
│
┌────────────┴────────────┐
│ 监控告警系统 │
│ Prometheus + Grafana │
└─────────────────────────┘
1.3 Kubernetes 部署配置
# redis-cluster.yaml - production Redis Cluster deployment.
# ConfigMap holding the redis.conf that the StatefulSet mounts at /etc/redis.
# Everything below `redis.conf: |` is literal file content handed to
# redis-server, so its lines must stay indented under the key.
apiVersion: v1
kind: ConfigMap
metadata:
  name: redis-cluster-config
  namespace: redis
data:
  redis.conf: |
    # 基础配置
    bind 0.0.0.0
    port 6379
    protected-mode no
    daemonize no
    pidfile /var/run/redis/redis-server.pid
    logfile /var/log/redis/redis.log
    dir /data
    # 集群配置
    cluster-enabled yes
    cluster-config-file /data/nodes.conf
    cluster-node-timeout 5000
    cluster-announce-ip ""
    cluster-announce-port 6379
    cluster-announce-bus-port 16379
    cluster-require-full-coverage yes
    cluster-migration-barrier 1
    # 内存管理
    maxmemory 8gb
    maxmemory-policy allkeys-lru
    maxmemory-samples 5
    # 持久化配置
    appendonly yes
    appendfsync everysec
    appendfilename "appendonly.aof"
    auto-aof-rewrite-percentage 100
    auto-aof-rewrite-min-size 64mb
    # RDB 快照
    save 900 1
    save 300 10
    save 60 10000
    stop-writes-on-bgsave-error yes
    rdbcompression yes
    rdbchecksum yes
    dbfilename dump.rdb
    # 性能优化
    tcp-backlog 511
    tcp-keepalive 300
    timeout 0
    slowlog-log-slower-than 10000
    slowlog-max-len 128
    latency-monitor-threshold 100
    # 安全配置
    # requirepass ${REDIS_PASSWORD}
    # masterauth ${REDIS_PASSWORD}
    rename-command FLUSHDB ""
    rename-command FLUSHALL ""
    rename-command KEYS ""
    rename-command DEBUG ""
    # 客户端限制
    maxclients 10000
    # 网络优化
    client-output-buffer-limit normal 0 0 0
    client-output-buffer-limit replica 256mb 64mb 60
    client-output-buffer-limit pubsub 32mb 8mb 60
---
# StatefulSet that runs the six Redis Cluster pods.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: redis-cluster
  namespace: redis
spec:
  # Governing (headless) Service that gives each pod a stable DNS name.
  serviceName: redis-cluster
  replicas: 6
  selector:
    matchLabels:
      app: redis-cluster
  template:
    metadata:
      labels:
        app: redis-cluster
    spec:
      affinity:
        # Hard rule: never schedule two redis-cluster pods on the same node.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchLabels:
                  app: redis-cluster
              topologyKey: kubernetes.io/hostname
      containers:
        - name: redis
          image: redis:7.2-alpine
          ports:
            - containerPort: 6379
              name: redis
            - containerPort: 16379
              name: cluster
          # Command-line options are applied after the config file, so the
          # pod's current IP (POD_IP, injected below) replaces the empty
          # cluster-announce-ip from the ConfigMap on every start.
          command:
            - redis-server
            - /etc/redis/redis.conf
            - "--cluster-announce-ip"
            - "$(POD_IP)"
          resources:
            requests:
              cpu: 2000m
              memory: 10Gi
            limits:
              cpu: 4000m
              memory: 12Gi
          volumeMounts:
            - name: redis-config
              mountPath: /etc/redis
            - name: redis-data
              mountPath: /data
            - name: redis-log
              mountPath: /var/log/redis
          env:
            # Downward API: the IP assigned to this pod.
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          livenessProbe:
            exec:
              command:
                - redis-cli
                - ping
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
                - redis-cli
                - ping
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
      volumes:
        - name: redis-config
          configMap:
            name: redis-cluster-config
        # Log directory is node-local scratch space; it does not survive the pod.
        - name: redis-log
          emptyDir: {}
  # One PersistentVolumeClaim per pod, mounted at /data.
  volumeClaimTemplates:
    - metadata:
        name: redis-data
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: ssd-storage
        resources:
          requests:
            storage: 50Gi
---
# Headless Service (clusterIP: None) referenced by the StatefulSet's
# serviceName; exposes the client port and the cluster bus port.
apiVersion: v1
kind: Service
metadata:
  name: redis-cluster
  namespace: redis
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - port: 6379
      targetPort: 6379
      name: redis
    - port: 16379
      targetPort: 16379
      name: cluster
  selector:
    app: redis-cluster
---
# Regular ClusterIP Service giving clients a single address on port 6379
# that selects every redis-cluster pod.
apiVersion: v1
kind: Service
metadata:
  name: redis-cluster-access
  namespace: redis
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: 6379
      name: redis
  selector:
    app: redis-cluster
1.4 Docker Compose 快速部署(测试环境)
# docker-compose.yml - Redis Cluster for test environments.
# Six nodes on a fixed-IP bridge network plus a one-shot container that
# creates the cluster once all nodes are up.
version: '3.8'
services:
  redis-node-1:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6379:6379"
      - "16379:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-1:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.11
  redis-node-2:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6380:6379"
      - "16380:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-2:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.12
  redis-node-3:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6381:6379"
      - "16381:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-3:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.13
  redis-node-4:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6382:6379"
      - "16382:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-4:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.14
  redis-node-5:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6383:6379"
      - "16383:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-5:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.15
  redis-node-6:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6384:6379"
      - "16384:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-6:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.16
  # One-shot cluster bootstrap container.
  redis-cluster-init:
    image: redis:7.2-alpine
    # depends_on only orders container start; it does not wait for Redis to
    # accept connections, hence the PING loop below instead of a fixed sleep.
    depends_on:
      - redis-node-1
      - redis-node-2
      - redis-node-3
      - redis-node-4
      - redis-node-5
      - redis-node-6
    # "$$" is Compose's escape for a literal "$" passed to the shell.
    command:
      - sh
      - "-c"
      - |
        for ip in 172.30.0.11 172.30.0.12 172.30.0.13 172.30.0.14 172.30.0.15 172.30.0.16; do
          until redis-cli -h "$$ip" -p 6379 ping >/dev/null 2>&1; do
            sleep 1
          done
        done
        redis-cli --cluster create \
          172.30.0.11:6379 172.30.0.12:6379 172.30.0.13:6379 \
          172.30.0.14:6379 172.30.0.15:6379 172.30.0.16:6379 \
          --cluster-replicas 1 --cluster-yes
    networks:
      - redis-net
networks:
  redis-net:
    driver: bridge
    ipam:
      config:
        - subnet: 172.30.0.0/24
volumes:
  redis-data-1:
  redis-data-2:
  redis-data-3:
  redis-data-4:
  redis-data-5:
  redis-data-6:
二、关键参数调优
2.1 Redis 核心配置详解
# ============================================
# /etc/redis/redis.conf - annotated production configuration
# ============================================
# Format reminder: one directive per line. A comment must be on its own line;
# redis.conf has no inline-comment syntax, so trailing "# ..." text after a
# directive is parsed as extra arguments.
# ----------------------------------------
# Network
# ----------------------------------------
# Listen address; in production prefer binding to an internal IP.
bind 0.0.0.0
# Listen port.
port 6379
# TCP accept-queue length; tune together with the kernel's somaxconn.
tcp-backlog 511
# TCP keepalive interval (seconds), used to detect dead peers.
tcp-keepalive 300
# Close idle client connections after N seconds (0 disables the timeout).
timeout 0
# ----------------------------------------
# General
# ----------------------------------------
# Run as a daemon? Use "no" under Docker/Kubernetes.
daemonize no
# PID file path.
pidfile /var/run/redis/redis-server.pid
# Log level: debug, verbose, notice, warning.
loglevel notice
# Log file path.
logfile /var/log/redis/redis.log
# Number of logical databases.
databases 16
# Show the ASCII logo at startup?
always-show-logo no
# ----------------------------------------
# Snapshots (RDB)
# ----------------------------------------
# Snapshot triggers, "save <seconds> <changes>": snapshot when at least
# <changes> writes happened within <seconds>. Adjust to how much data loss is
# acceptable.
# At least 1 change within 15 minutes.
save 900 1
# At least 10 changes within 5 minutes.
save 300 10
# At least 10000 changes within 1 minute.
save 60 10000
# Refuse writes when the last background save failed.
stop-writes-on-bgsave-error yes
# Compress RDB files (costs CPU, saves disk space).
rdbcompression yes
# Checksum RDB files (roughly 10% overhead when saving/loading).
rdbchecksum yes
# RDB file name.
dbfilename dump.rdb
# Data directory.
dir /data
# ----------------------------------------
# Replication
# ----------------------------------------
# Replicas reject writes.
replica-read-only yes
# Replication backlog size (used by PSYNC partial resynchronisation).
repl-backlog-size 256mb
# Backlog time-to-live (seconds).
repl-backlog-ttl 3600
# While the link to the master is down or a sync is still in progress:
#   yes: the replica keeps answering clients, possibly with stale data
#   no:  the replica rejects most commands with an error
replica-serve-stale-data yes
# Diskless full sync: stream the RDB to replicas over the socket instead of
# writing it to disk first; wait <delay> seconds so more replicas can join.
repl-diskless-sync yes
repl-diskless-sync-delay 5
# Replication timeout (seconds).
repl-timeout 60
# ----------------------------------------
# Security
# ----------------------------------------
# Client password (use a strong one).
requirepass your_strong_password_here
# Password replicas use to authenticate with the master.
masterauth your_strong_password_here
# Disable or rename dangerous commands (hardening).
# NOTE(review): with CONFIG disabled, tooling that issues CONFIG (for example
# Sentinel or a metrics exporter) may stop working - confirm before rollout.
rename-command FLUSHDB ""
rename-command FLUSHALL ""
rename-command KEYS ""
rename-command CONFIG ""
rename-command DEBUG ""
# This file is not processed by a shell, so "$(...)" is never expanded and the
# new name must be a fixed literal. Generate a random suffix once (for example
# with `openssl rand -hex 8`) and paste it in place of the placeholder.
rename-command SHUTDOWN SHUTDOWN_REPLACE_WITH_RANDOM_SUFFIX
# ----------------------------------------
# Client limits
# ----------------------------------------
# Maximum number of simultaneous client connections.
maxclients 10000
# Client output buffer limits.
# Format: <class> <hard limit> <soft limit> <soft seconds>
# normal:  regular clients
# replica: replica clients
# pubsub:  Pub/Sub subscribers
client-output-buffer-limit normal 0 0 0
client-output-buffer-limit replica 256mb 64mb 60
client-output-buffer-limit pubsub 32mb 8mb 60
# Upper bound for a client's query buffer.
client-query-buffer-limit 1gb
# ----------------------------------------
# Memory management
# ----------------------------------------
# Memory cap (leave about 20% of RAM to the system).
maxmemory 8gb
# Eviction policy:
# - noeviction:      never evict; writes fail once memory is full
# - allkeys-lru:     evict least recently used keys, any key
# - volatile-lru:    evict least recently used keys among keys with a TTL
# - allkeys-lfu:     evict least frequently used keys, any key
# - volatile-lfu:    evict least frequently used keys among keys with a TTL
# - allkeys-random:  evict random keys
# - volatile-random: evict random keys among keys with a TTL
# - volatile-ttl:    evict the keys closest to expiring
maxmemory-policy allkeys-lru
# LRU/LFU sample size; larger is more accurate but slower.
maxmemory-samples 5
# Replicas ignore maxmemory and simply follow the master's evictions.
replica-ignore-maxmemory yes
# ----------------------------------------
# Lazy freeing (asynchronous deletion of large keys)
# ----------------------------------------
# "yes" releases memory in a background thread so that evicting, expiring or
# implicitly overwriting a large key does not block the main thread; "no"
# frees synchronously.
lazyfree-lazy-eviction yes
lazyfree-lazy-expire yes
lazyfree-lazy-server-del yes
replica-lazy-flush yes
# ----------------------------------------
# AOF persistence
# ----------------------------------------
# Enable the append-only file.
appendonly yes
# AOF file name.
appendfilename "appendonly.aof"
# fsync policy:
# - always:   fsync on every write (safest, slowest)
# - everysec: fsync once per second (recommended)
# - no:       leave it to the OS (fastest, may lose data)
appendfsync everysec
# Skip fsync while a rewrite/snapshot child is running?
no-appendfsync-on-rewrite no
# Automatic AOF rewrite triggers.
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 64mb
# Load an AOF whose tail is truncated instead of refusing to start.
aof-load-truncated yes
# Mixed RDB + AOF persistence (Redis 4.0+).
aof-use-rdb-preamble yes
# ----------------------------------------
# Lua scripts
# ----------------------------------------
# Maximum script execution time (milliseconds); 0 means unlimited.
lua-time-limit 5000
# ----------------------------------------
# Redis Cluster
# ----------------------------------------
# Enable cluster mode.
cluster-enabled yes
# Cluster state file (maintained by Redis itself).
cluster-config-file /data/nodes.conf
# Node timeout (milliseconds).
cluster-node-timeout 5000
# Replica validity factor (same option as the older
# "cluster-slave-validity-factor" spelling).
cluster-replica-validity-factor 10
# Require every slot to be covered before serving queries?
cluster-require-full-coverage yes
# Migration barrier.
cluster-migration-barrier 1
# ----------------------------------------
# Slow log
# ----------------------------------------
# Threshold in microseconds; a negative value disables the slow log.
slowlog-log-slower-than 10000
# Maximum number of slow-log entries kept.
slowlog-max-len 128
# ----------------------------------------
# Latency monitor
# ----------------------------------------
# Threshold in milliseconds; 0 disables the monitor.
latency-monitor-threshold 100
# ----------------------------------------
# Event notification
# ----------------------------------------
# Keyspace notifications (enable only when needed).
notify-keyspace-events ""
# ----------------------------------------
# Advanced
# ----------------------------------------
# Compact-encoding thresholds for hashes.
hash-max-ziplist-entries 512
hash-max-ziplist-value 64
# Compact-encoding thresholds for lists.
list-max-ziplist-size -2
list-compress-depth 0
# Compact-encoding threshold for integer sets.
set-max-intset-entries 512
# Compact-encoding thresholds for sorted sets.
zset-max-ziplist-entries 128
zset-max-ziplist-value 64
# HyperLogLog sparse-representation limit.
hll-sparse-max-bytes 3000
# Stream node size limits.
stream-node-max-bytes 4096
stream-node-max-entries 100
# Active defragmentation.
activedefrag yes
active-defrag-ignore-bytes 100mb
active-defrag-threshold-lower 10
active-defrag-threshold-upper 100
active-defrag-cycle-min 1
active-defrag-cycle-max 25
2.2 Sentinel 配置详解
# /etc/redis/sentinel.conf - Sentinel configuration
# NOTE(review): Sentinel is the failover mechanism for non-cluster
# master/replica deployments; nodes started with "cluster-enabled yes" fail
# over through the cluster itself. Confirm which topology this file targets.
port 26379
daemonize no
pidfile /var/run/redis/redis-sentinel.pid
logfile /var/log/redis/sentinel.log
dir /tmp
# Master to monitor: sentinel monitor <name> <ip> <port> <quorum>
#   mymaster:      logical name of the master
#   10.0.1.1 6379: address of the master as reachable from EVERY Sentinel host.
#                  Do not use 127.0.0.1 - each Sentinel would then watch its
#                  own local host instead of the shared master.
#   2:             quorum - number of Sentinels that must agree the master is
#                  down before it is considered objectively down
sentinel monitor mymaster 10.0.1.1 6379 2
# Password of the monitored master.
sentinel auth-pass mymaster your_strong_password_here
# Time (milliseconds) without a valid reply before the master is considered
# subjectively down.
sentinel down-after-milliseconds mymaster 30000
# Failover timeout (milliseconds).
sentinel failover-timeout mymaster 180000
# How many replicas may resynchronise with the new master at the same time.
sentinel parallel-syncs mymaster 1
# Script executed after a failover.
sentinel client-reconfig-script mymaster /etc/redis/failover-notify.sh
# Notification script (triggered for warning-level events).
sentinel notification-script mymaster /etc/redis/notify.sh
三、系统内核/OS 层优化
3.1 系统内核参数调优
# /etc/sysctl.d/99-redis.conf - kernel parameters for Redis hosts
# (the same lines may instead be appended to /etc/sysctl.conf).
#
# Only "key = value" lines belong in this file; shell commands are not
# understood by sysctl and are therefore shown as comments.
# ============================================
# Memory management
# ============================================
# Transparent Huge Pages (THP) cause latency spikes in Redis. THP is not a
# sysctl; disable it from a root shell or a boot-time unit:
#   echo never > /sys/kernel/mm/transparent_hugepage/enabled
#   echo never > /sys/kernel/mm/transparent_hugepage/defrag
# Always allow memory overcommit so that fork() for BGSAVE / AOF rewrite does
# not fail when the dataset is large; Redis logs a startup warning otherwise.
vm.overcommit_memory = 1
# Maximum number of memory-map areas per process.
vm.max_map_count = 262144
# Swap tendency (1 = avoid swapping as far as possible).
vm.swappiness = 1
# Dirty-page thresholds (write-back buffering).
vm.dirty_ratio = 30
vm.dirty_background_ratio = 10
# ============================================
# File handles and network
# ============================================
# System-wide maximum number of open files.
fs.file-max = 1000000
# TCP connection queues.
net.core.somaxconn = 65535
net.ipv4.tcp_max_syn_backlog = 65535
# TIME_WAIT reuse and FIN timeout.
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_fin_timeout = 30
# TCP keepalive.
net.ipv4.tcp_keepalive_time = 300
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 3
# TCP memory limits and socket buffer sizes.
net.ipv4.tcp_mem = 786432 1048576 1572864
net.core.rmem_default = 262144
net.core.rmem_max = 16777216
net.core.wmem_default = 262144
net.core.wmem_max = 16777216
# NIC receive backlog length.
net.core.netdev_max_backlog = 65535
# ============================================
# Process scheduling
# ============================================
# Maximum PID value.
kernel.pid_max = 4194303
# ============================================
# File system
# ============================================
# inotify limits.
fs.inotify.max_user_instances = 8192
fs.inotify.max_user_watches = 524288
# Apply from a root shell after saving this file. "--system" loads
# /etc/sysctl.conf and every file under /etc/sysctl.d/, whereas a bare
# "sysctl -p" reads /etc/sysctl.conf only:
#   sysctl --system
3.2 用户资源限制
# /etc/security/limits.conf - resource limits for the redis user.
# Columns: <domain> <type> <item> <value>; type "-" sets the soft and the
# hard limit to the same value in one line.
# Open file descriptors.
redis - nofile 65536
# Processes.
redis - nproc 65536
# Locked-in-memory address space (no cap on how much the process may lock).
redis - memlock unlimited
# Pending signals.
redis - sigpending 65536
3.3 系统服务优化
# Install a oneshot systemd unit that writes "never" to both THP control
# files, ordered before basic.target, then enable and run it immediately.
cat > /etc/systemd/system/disable-thp.service << 'EOF'
[Unit]
Description=Disable Transparent Huge Pages (THP)
DefaultDependencies=no
After=sysinit.target local-fs.target
Before=basic.target
[Service]
Type=oneshot
ExecStart=/bin/sh -c 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'
ExecStart=/bin/sh -c 'echo never > /sys/kernel/mm/transparent_hugepage/defrag'
[Install]
WantedBy=basic.target
EOF
systemctl daemon-reload
# --now combines "enable" (start at boot) with "start" (run right away).
systemctl enable --now disable-thp
3.4 磁盘 I/O 调度优化
# Block device that backs the Redis data directory (sda, vda, nvme0n1, ...).
DEV=sda
SCHED_FILE="/sys/block/${DEV}/queue/scheduler"
# List the schedulers this kernel offers; the active one is in [brackets].
cat "$SCHED_FILE"
# Scheduler names depend on the kernel's block layer:
#   multi-queue (blk-mq) kernels: none | mq-deadline | kyber | bfq
#   legacy single-queue kernels:  noop | deadline | cfq
# Writing a name the kernel does not offer fails, so pick the first candidate
# that appears in the list.
# SSD / NVMe: no reordering ("none", or "noop" on legacy kernels).
for sched in none noop; do
  if grep -qw -- "$sched" "$SCHED_FILE"; then
    echo "$sched" > "$SCHED_FILE"
    break
  fi
done
# HDD alternative: deadline-based scheduling (legacy kernels also offer cfq).
# for sched in mq-deadline deadline; do
#   if grep -qw -- "$sched" "$SCHED_FILE"; then
#     echo "$sched" > "$SCHED_FILE"
#     break
#   fi
# done
# The setting is lost on reboot; to make it permanent, run the same commands
# from /etc/rc.local or a systemd unit.
3.5 NUMA 优化
# Show the NUMA topology.
numactl --hardware
# One-off: run Redis with CPUs and memory pinned to NUMA node 0.
numactl --cpunodebind=0 --membind=0 /usr/bin/redis-server /etc/redis/redis.conf
# Or let systemd manage the process and its NUMA policy.
cat > /etc/systemd/system/redis.service << 'EOF'
[Unit]
Description=Redis Server
After=network.target
[Service]
Type=notify
User=redis
Group=redis
# Type=notify makes systemd wait for a readiness notification, which Redis
# only sends in supervised mode; without it the start job times out.
ExecStart=/usr/bin/redis-server /etc/redis/redis.conf --supervised systemd --daemonize no
# No ExecStop on purpose: systemd stops the service with SIGTERM, which Redis
# handles as a graceful shutdown. "redis-cli shutdown" would be rejected once
# requirepass is set or the SHUTDOWN command has been renamed.
Restart=always
LimitNOFILE=65536
# NUMA memory policy: allocate from node 0 only.
# NOTE(review): these two settings cover memory placement; if CPU pinning
# equivalent to --cpunodebind is also wanted, add CPUAffinity= - confirm the
# syntax supported by the installed systemd version.
NUMAPolicy=bind
NUMAMask=0
[Install]
WantedBy=multi-user.target
EOF
# Make systemd pick up the new unit file.
systemctl daemon-reload
四、监控与告警
4.1 Prometheus Redis Exporter 配置
# prometheus.yml - Redis scrape configuration
scrape_configs:
  # Pattern 1: one exporter next to every Redis node, scraped directly.
  - job_name: 'redis'
    static_configs:
      - targets:
          - redis-node-01:9121
          - redis-node-02:9121
          - redis-node-03:9121
          - redis-node-04:9121
          - redis-node-05:9121
          - redis-node-06:9121
    relabel_configs:
      # Strip the port so that "instance" is just the host name.
      - source_labels: [__address__]
        regex: '([^:]+):\d+'
        target_label: instance
        replacement: '${1}'
  # Pattern 2: one shared exporter probing many Redis nodes via /scrape.
  # The targets are the Redis instances themselves; relabelling moves each
  # one into the ?target= query parameter and points the actual HTTP request
  # at the exporter.
  - job_name: 'redis-cluster'
    static_configs:
      - targets:
          - redis://redis-node-01:6379
          - redis://redis-node-02:6379
          - redis://redis-node-03:6379
          - redis://redis-node-04:6379
          - redis://redis-node-05:6379
          - redis://redis-node-06:6379
    metrics_path: /scrape
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: redis-exporter:9121
4.2 Redis Exporter 启动配置
# docker-compose.yml - Redis Exporter
version: '3.8'
services:
  redis-exporter:
    # NOTE(review): pin a specific release tag instead of "latest" so that
    # metric names do not change underneath dashboards and alerts.
    image: oliver006/redis_exporter:latest
    ports:
      - "9121:9121"
    environment:
      - REDIS_ADDR=redis://redis-node-01:6379
      # Taken from the environment of the shell running docker compose, so the
      # secret is not committed with this file; startup aborts if it is unset.
      - 'REDIS_PASSWORD=${REDIS_PASSWORD:?REDIS_PASSWORD must be set}'
      - REDIS_EXPORTER_DEBUG=false
      - REDIS_EXPORTER_INCL_SYSTEM_METRICS=true
      # The address above is plain redis:// (no TLS), so there is no
      # certificate check to skip; keep verification on.
      - REDIS_EXPORTER_SKIP_TLS_VERIFICATION=false
      # false = also export the exporter's own Go runtime metrics.
      - REDIS_EXPORTER_REDIS_ONLY_METRICS=false
    command:
      - '--web.listen-address=:9121'
      - '--web.telemetry-path=/metrics'
      - '--log-format=txt'
      # Glob patterns belong to --check-keys (resolved with SCAN);
      # --check-single-keys only accepts literal key names.
      - '--check-keys=db0=keys-*'
      # NOTE(review): --check-key-groups expects Lua patterns that classify
      # keys into groups; "count,avg" looks unintended - confirm.
      - '--check-key-groups=db0=count,avg'
    restart: always
4.3 Grafana 面板 JSON
{
"dashboard": {
"title": "Redis Cluster Monitor",
"uid": "redis-cluster-monitor",
"tags": ["redis", "database", "cluster"],
"timezone": "browser",
"refresh": "30s",
"panels": [
{
"title": "Redis 内存使用率",
"type": "gauge",
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 0},
"targets": [
{
"expr": "redis_memory_used_bytes / redis_memory_max_bytes * 100",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 85}
]
}
}
}
},
{
"title": "QPS (每秒操作数)",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 6, "y": 0},
"targets": [
{
"expr": "rate(redis_commands_processed_total[1m])",
"legendFormat": "{{instance}}",
"refId": "A"
}
]
},
{
"title": "连接数",
"type": "timeseries",
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 8},
"targets": [
{
"expr": "redis_connected_clients",
"legendFormat": "{{instance}}",
"refId": "A"
}
]
},
{
"title": "Key 过期/淘汰统计",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 6, "y": 8},
"targets": [
{
"expr": "rate(redis_expired_keys_total[1m])",
"legendFormat": "过期 {{instance}}",
"refId": "A"
},
{
"expr": "rate(redis_evicted_keys_total[1m])",
"legendFormat": "淘汰 {{instance}}",
"refId": "B"
}
]
},
{
  "title": "复制延迟",
  "type": "timeseries",
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
  "targets": [
    {
      "expr": "redis_connected_slave_lag_seconds",
      "legendFormat": "{{instance}} -> {{slave_ip}}:{{slave_port}}",
      "refId": "A"
    }
  ]
},
{
  "title": "慢日志统计",
  "type": "stat",
  "gridPos": {"h": 4, "w": 6, "x": 12, "y": 16},
  "targets": [
    {
      "expr": "increase(redis_slowlog_last_id[5m])",
      "legendFormat": "慢查询数",
      "refId": "A"
    }
  ]
},
{
"title": "集群状态",
"type": "stat",
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 20},
"targets": [
{
"expr": "redis_cluster_state",
"legendFormat": "集群状态",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"mappings": [
{"type": "value", "options": {"0": {"text": "Fail", "color": "red"}}},
{"type": "value", "options": {"1": {"text": "OK", "color": "green"}}}
]
}
}
},
{
"title": "持久化状态",
"type": "table",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
"targets": [
{
"expr": "redis_rdb_last_bgsave_status",
"format": "table",
"refId": "A"
},
{
"expr": "redis_aof_last_bgrewrite_status",
"format": "table",
"refId": "B"
}
]
},
{
"title": "网络带宽",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
"targets": [
{
"expr": "rate(redis_net_input_bytes_total[1m])",
"legendFormat": "入站 {{instance}}",
"refId": "A"
},
{
"expr": "rate(redis_net_output_bytes_total[1m])",
"legendFormat": "出站 {{instance}}",
"refId": "B"
}
]
},
{
"title": "命令执行时间分布",
"type": "heatmap",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 32},
"targets": [
{
"expr": "histogram_quantile(0.99, rate(redis_command_duration_seconds_bucket[5m]))",
"legendFormat": "P99",
"refId": "A"
}
],
"dataFormat": "timeseries"
},
{
"title": "键空间分布",
"type": "piechart",
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 32},
"targets": [
{
"expr": "redis_db_keys{db!~\"db[0-9]+_expires\"}",
"legendFormat": "{{db}}",
"refId": "A"
}
]
},
{
"title": "客户端连接分布",
"type": "piechart",
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 32},
"targets": [
{
"expr": "redis_connected_clients",
"legendFormat": "{{instance}}",
"refId": "A"
}
]
}
]
}
}
4.4 Prometheus 告警规则(由 Alertmanager 负责路由与通知)
# redis-alerts.yml - Prometheus alerting rules for Redis.
# Conventions used below:
#   - "(redis_memory_max_bytes > 0)" drops instances without a maxmemory
#     limit, so ratios never divide by zero and "used >= 0" never matches.
#   - Replication rules use metrics reported per replica link
#     (redis_master_link_up on replicas, redis_connected_slave_lag_seconds on
#     masters); PromQL cannot compare a series with a string such as "slave".
groups:
  - name: redis
    rules:
      # ===== Cluster state =====
      - alert: RedisClusterDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 节点 {{ $labels.instance }} 不可用"
          description: "Redis 节点 {{ $labels.instance }} 已经宕机超过 1 分钟"
      - alert: RedisClusterStateDown
        expr: redis_cluster_state == 0
        for: 30s
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 集群状态异常"
          description: "Redis 集群状态为 FAIL,可能存在槽位未覆盖"
      - alert: RedisClusterSlotsDown
        expr: redis_cluster_slots_assigned != 16384
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 集群槽位分配不完整"
          description: "当前已分配槽位 {{ $value }},应为 16384"
      # ===== Memory =====
      - alert: RedisMemoryHighUsage
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 85
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Redis 内存使用率过高"
          description: "{{ $labels.instance }} 内存使用率 {{ $value | printf \"%.2f\" }}%"
      - alert: RedisMemoryCriticalUsage
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 95
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 内存即将耗尽"
          description: "{{ $labels.instance }} 内存使用率 {{ $value | printf \"%.2f\" }}%,请立即处理"
      - alert: RedisOOM
        expr: increase(redis_memory_used_peak_bytes[5m]) > 0 and redis_memory_used_bytes >= (redis_memory_max_bytes > 0)
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 已触发 OOM"
          description: "{{ $labels.instance }} 已达到最大内存限制,开始淘汰键"
      # ===== Connections =====
      - alert: RedisTooManyConnections
        expr: redis_connected_clients / redis_config_maxclients * 100 > 80
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Redis 连接数过高"
          description: "{{ $labels.instance }} 连接数 {{ $value | printf \"%.2f\" }}%"
      - alert: RedisRejectedConnections
        expr: increase(redis_rejected_connections_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 拒绝连接"
          description: "{{ $labels.instance }} 因 maxclients 限制拒绝连接"
      # ===== Replication =====
      - alert: RedisReplicaLag
        expr: redis_connected_slave_lag_seconds > 10
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Redis 副本同步延迟过高"
          description: "{{ $labels.instance }} 副本延迟 {{ $value }} 秒"
      - alert: RedisReplicaDisconnected
        expr: redis_master_link_up == 0
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 副本与主节点断开"
          description: "{{ $labels.instance }} 副本连接已断开"
      # ===== Persistence =====
      - alert: RedisRDBSaveFailed
        expr: redis_rdb_last_bgsave_status != 1
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis RDB 快照保存失败"
          description: "{{ $labels.instance }} 最近一次 BGSAVE 失败"
      - alert: RedisAOFRewriteFailed
        expr: redis_aof_last_bgrewrite_status != 1
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis AOF 重写失败"
          description: "{{ $labels.instance }} 最近一次 AOF 重写失败"
      - alert: RedisPersistenceStalled
        expr: time() - redis_rdb_last_save_timestamp_seconds > 3600 and redis_rdb_changes_since_last_save > 0
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Redis 持久化长时间未执行"
          description: "{{ $labels.instance }} 超过 1 小时未执行 RDB 快照"