SRE 每日主题:Redis 集群运维与性能调优

日期: 2026-03-18
主题序号: 5
难度等级: ⭐⭐⭐⭐
适用场景: 生产环境 Redis 集群部署与运维


一、生产环境部署架构

1.1 Redis 集群架构(Cluster Mode)

                        ┌─────────────────────────────────────────┐
                        │            客户端连接层                  │
                        │   (Redis Cluster-aware Client SDK)      │
                        └──────────────────┬──────────────────────┘
                                           │
                    ┌──────────────────────┼──────────────────────┐
                    │                      │                      │
                    ▼                      ▼                      ▼
           ┌───────────────┐      ┌───────────────┐      ┌───────────────┐
           │  Master-01    │      │  Master-02    │      │  Master-03    │
           │  Slot: 0-5460 │      │  Slot: 5461-10922│   │  Slot: 10923-16383│
           │  Port: 6379   │      │  Port: 6379   │      │  Port: 6379   │
           └───────┬───────┘      └───────┬───────┘      └───────┬───────┘
                   │                      │                      │
           ┌───────┴───────┐      ┌───────┴───────┐      ┌───────┴───────┐
           │  Replica-01   │      │  Replica-02   │      │  Replica-03   │
           │  Port: 6379   │      │  Port: 6379   │      │  Port: 6379   │
           └───────────────┘      └───────────────┘      └───────────────┘

1.2 推荐部署架构(生产环境)

┌─────────────────────────────────────────────────────────────────────────┐
│                           应用层 (多实例)                                │
│                    Redis Cluster-aware SDK                              │
└────────────────────────────────┬────────────────────────────────────────┘
                                 │
                    ┌────────────┴────────────┐
                    │    负载均衡 / VIP        │
                    │    (HAProxy / Envoy)    │
                    └────────────┬────────────┘
                                 │
        ┌────────────────────────┼────────────────────────┐
        │                        │                        │
        ▼                        ▼                        ▼
┌───────────────┐        ┌───────────────┐        ┌───────────────┐
│   Shard-01    │        │   Shard-02    │        │   Shard-03    │
│               │        │               │        │               │
│ ┌───────────┐ │        │ ┌───────────┐ │        │ ┌───────────┐ │
│ │  Master   │ │        │ │  Master   │ │        │ │  Master   │ │
│ │  10.0.1.1 │ │        │ │  10.0.2.1 │ │        │ │  10.0.3.1 │ │
│ └─────┬─────┘ │        │ └─────┬─────┘ │        │ └─────┬─────┘ │
│       │       │        │       │       │        │       │       │
│ ┌─────┴─────┐ │        │ ┌─────┴─────┐ │        │ ┌─────┴─────┐ │
│ │  Replica  │ │        │ │  Replica  │ │        │ │  Replica  │ │
│ │  10.0.1.2 │ │        │ │  10.0.2.2 │ │        │ │  10.0.3.2 │ │
│ └───────────┘ │        │ └───────────┘ │        │ └───────────┘ │
│               │        │               │        │               │
│  Sentinel     │        │  Sentinel     │        │  Sentinel     │
│  (仲裁节点)   │        │  (仲裁节点)   │        │  (仲裁节点)   │
└───────────────┘        └───────────────┘        └───────────────┘
        │                        │                        │
        └────────────────────────┴────────────────────────┘
                                 │
                    ┌────────────┴────────────┐
                    │      监控告警系统        │
                    │  Prometheus + Grafana   │
                    └─────────────────────────┘

1.3 Kubernetes 部署配置

# redis-cluster.yaml - production Redis Cluster deployment.
# ConfigMap "redis-cluster-config" in namespace "redis"; its single data key
# carries the redis.conf text.
apiVersion: v1
kind: ConfigMap
metadata:
  name: redis-cluster-config
  namespace: redis
data:
  # Literal block scalar ("|"): the indented text below is stored verbatim as
  # the value of the redis.conf key, including its own "#" comment lines.
  # Sections, in order: basics, cluster, memory, AOF persistence, RDB
  # snapshots, performance, security, client limits, output-buffer limits.
  # NOTE(review): protected-mode is "no", bind is 0.0.0.0 and the requirepass /
  # masterauth lines are commented out, so no password is configured here;
  # confirm that access is restricted by other means.
  # NOTE(review): the commented "${REDIS_PASSWORD}" lines look like template
  # placeholders; presumably something must substitute them before use; verify.
  # NOTE(review): cluster-announce-ip is an empty string; presumably it is
  # meant to be supplied per pod; confirm.
  redis.conf: |
    # 基础配置
    bind 0.0.0.0
    port 6379
    protected-mode no
    daemonize no
    pidfile /var/run/redis/redis-server.pid
    logfile /var/log/redis/redis.log
    dir /data

    # 集群配置
    cluster-enabled yes
    cluster-config-file /data/nodes.conf
    cluster-node-timeout 5000
    cluster-announce-ip ""
    cluster-announce-port 6379
    cluster-announce-bus-port 16379
    cluster-require-full-coverage yes
    cluster-migration-barrier 1

    # 内存管理
    maxmemory 8gb
    maxmemory-policy allkeys-lru
    maxmemory-samples 5

    # 持久化配置
    appendonly yes
    appendfsync everysec
    appendfilename "appendonly.aof"
    auto-aof-rewrite-percentage 100
    auto-aof-rewrite-min-size 64mb

    # RDB 快照
    save 900 1
    save 300 10
    save 60 10000
    stop-writes-on-bgsave-error yes
    rdbcompression yes
    rdbchecksum yes
    dbfilename dump.rdb

    # 性能优化
    tcp-backlog 511
    tcp-keepalive 300
    timeout 0
    slowlog-log-slower-than 10000
    slowlog-max-len 128
    latency-monitor-threshold 100

    # 安全配置
    # requirepass ${REDIS_PASSWORD}
    # masterauth ${REDIS_PASSWORD}
    rename-command FLUSHDB ""
    rename-command FLUSHALL ""
    rename-command KEYS ""
    rename-command DEBUG ""

    # 客户端限制
    maxclients 10000

    # 网络优化
    client-output-buffer-limit normal 0 0 0
    client-output-buffer-limit replica 256mb 64mb 60
    client-output-buffer-limit pubsub 32mb 8mb 60

---
# StatefulSet running the six Redis Cluster pods. Each pod gets its own
# "redis-data" PersistentVolumeClaim (mounted at /data) and mounts the
# "redis-cluster-config" ConfigMap at /etc/redis.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: redis-cluster
  namespace: redis
spec:
  serviceName: redis-cluster
  replicas: 6
  selector:
    matchLabels:
      app: redis-cluster
  template:
    metadata:
      labels:
        app: redis-cluster
    spec:
      affinity:
        # Hard anti-affinity: never schedule two redis-cluster pods on the
        # same node (topology key is the node hostname).
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchLabels:
                  app: redis-cluster
              topologyKey: kubernetes.io/hostname
      containers:
        - name: redis
          image: redis:7.2-alpine
          ports:
            - containerPort: 6379
              name: redis
            - containerPort: 16379
              name: cluster
          # Kubernetes expands $(POD_IP) from the env entry below. The
          # command-line option is applied after the config file, so it
          # replaces the empty cluster-announce-ip from redis.conf and each
          # node announces its current pod IP to the rest of the cluster.
          command:
            - redis-server
            - /etc/redis/redis.conf
            - "--cluster-announce-ip"
            - "$(POD_IP)"
          resources:
            requests:
              cpu: 2000m
              memory: 10Gi
            limits:
              cpu: 4000m
              memory: 12Gi
          volumeMounts:
            - name: redis-config
              mountPath: /etc/redis
            - name: redis-data
              mountPath: /data
            - name: redis-log
              mountPath: /var/log/redis
          env:
            # Pod IP from the downward API; consumed by the command above.
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          # Both probes run "redis-cli ping" inside the container.
          livenessProbe:
            exec:
              command:
                - redis-cli
                - ping
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command:
                - redis-cli
                - ping
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
      volumes:
        - name: redis-config
          configMap:
            name: redis-cluster-config
        # Logs go to an emptyDir and do not survive pod deletion.
        - name: redis-log
          emptyDir: {}
  volumeClaimTemplates:
    - metadata:
        name: redis-data
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: ssd-storage
        resources:
          requests:
            storage: 50Gi

---
# Headless Service (clusterIP: None) named after the StatefulSet's
# serviceName; exposes the client port and the cluster bus port of the
# pods labelled app=redis-cluster.
apiVersion: v1
kind: Service
metadata:
  namespace: redis
  name: redis-cluster
spec:
  clusterIP: None
  type: ClusterIP
  selector:
    app: redis-cluster
  ports:
    # Client port
    - name: redis
      port: 6379
      targetPort: 6379
    # Cluster bus port
    - name: cluster
      port: 16379
      targetPort: 16379

---
# Regular ClusterIP Service exposing only the client port (6379) of the
# pods labelled app=redis-cluster.
apiVersion: v1
kind: Service
metadata:
  namespace: redis
  name: redis-cluster-access
spec:
  type: ClusterIP
  selector:
    app: redis-cluster
  ports:
    - name: redis
      port: 6379
      targetPort: 6379

1.4 Docker Compose 快速部署(测试环境)

# docker-compose.yml - Redis Cluster test environment.
# Six identical Redis nodes with fixed IPs 172.30.0.11-16 on the redis-net
# bridge network, plus a one-shot container that creates the cluster.
# Every node mounts the same ./redis.conf from the host.
# NOTE(review): ./redis.conf is not shown here; it presumably needs
# cluster-enabled yes for "redis-cli --cluster create" to succeed; confirm.
# NOTE(review): the top-level "version" key is treated as obsolete by current
# Docker Compose releases; confirm whether it is still needed.
version: '3.8'

services:
  # Node 1: host ports 6379 (client) and 16379 (cluster bus)
  redis-node-1:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6379:6379"
      - "16379:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-1:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.11

  # Node 2: host ports 6380 / 16380
  redis-node-2:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6380:6379"
      - "16380:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-2:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.12

  # Node 3: host ports 6381 / 16381
  redis-node-3:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6381:6379"
      - "16381:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-3:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.13

  # Node 4: host ports 6382 / 16382
  redis-node-4:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6382:6379"
      - "16382:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-4:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.14

  # Node 5: host ports 6383 / 16383
  redis-node-5:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6383:6379"
      - "16383:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-5:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.15

  # Node 6: host ports 6384 / 16384
  redis-node-6:
    image: redis:7.2-alpine
    command: redis-server /etc/redis/redis.conf
    ports:
      - "6384:6379"
      - "16384:16379"
    volumes:
      - ./redis.conf:/etc/redis/redis.conf
      - redis-data-6:/data
    networks:
      redis-net:
        ipv4_address: 172.30.0.16

  # Cluster bootstrap container: waits 10 seconds, then runs
  # "redis-cli --cluster create" over the six node addresses with one
  # replica per master (--cluster-replicas 1), answering yes to the prompt.
  # depends_on only orders container start; the fixed sleep is what gives
  # the nodes time to come up.
  redis-cluster-init:
    image: redis:7.2-alpine
    depends_on:
      - redis-node-1
      - redis-node-2
      - redis-node-3
      - redis-node-4
      - redis-node-5
      - redis-node-6
    command: >
      sh -c "sleep 10 &&
        redis-cli --cluster create
        172.30.0.11:6379 172.30.0.12:6379 172.30.0.13:6379
        172.30.0.14:6379 172.30.0.15:6379 172.30.0.16:6379
        --cluster-replicas 1 --cluster-yes"
    networks:
      - redis-net

# Bridge network with a fixed subnet so the static node IPs above are valid.
networks:
  redis-net:
    driver: bridge
    ipam:
      config:
        - subnet: 172.30.0.0/24

# One named volume per node, mounted at /data.
volumes:
  redis-data-1:
  redis-data-2:
  redis-data-3:
  redis-data-4:
  redis-data-5:
  redis-data-6:

二、关键参数调优

2.1 Redis 核心配置详解

# ============================================
# /etc/redis/redis.conf - annotated production configuration
# ============================================

# ----------------------------------------
# Network
# ----------------------------------------
# Listen address; binding to an internal IP is recommended in production.
# NOTE(review): 0.0.0.0 is used here despite that advice; confirm.
bind 0.0.0.0

# Listening port
port 6379

# TCP accept-queue length; tune together with the kernel's somaxconn
tcp-backlog 511

# TCP keepalive, used to detect dead connections
tcp-keepalive 300

# Idle-client timeout (0 disables it)
timeout 0

# ----------------------------------------
# General
# ----------------------------------------
# Run as a daemon? (set to no under Docker/K8s)
daemonize no

# PID file path
pidfile /var/run/redis/redis-server.pid

# Log level: debug, verbose, notice, warning
loglevel notice

# Log file path
logfile /var/log/redis/redis.log

# Number of databases
databases 16

# Show the logo at startup?
always-show-logo no

# ----------------------------------------
# Snapshots (RDB)
# ----------------------------------------
# Save policy: "save <seconds> <changes>" triggers a snapshot when at least
# <changes> writes happened within <seconds> seconds.
# Tune the thresholds to how important the data is.
# Comments must sit on their own lines: redis.conf only treats a line that
# starts with "#" as a comment, so a trailing "# ..." after a directive is
# parsed as extra arguments and the server refuses to start.
# 15 minutes, at least 1 change
save 900 1
# 5 minutes, at least 10 changes
save 300 10
# 1 minute, at least 10000 changes
save 60 10000

# Stop accepting writes when BGSAVE fails
stop-writes-on-bgsave-error yes

# Compress the RDB file (costs CPU, saves space)
rdbcompression yes

# Checksum the RDB file (roughly 10% overhead when saving/loading)
rdbchecksum yes

# RDB file name
dbfilename dump.rdb

# Data directory
dir /data

# ----------------------------------------
# Replication
# ----------------------------------------
# Replicas are read-only
replica-read-only yes

# Replication backlog size (used by PSYNC partial resynchronization)
repl-backlog-size 256mb

# Replication backlog time-to-live
repl-backlog-ttl 3600

# Whether a replica keeps answering client requests, possibly with stale
# data, while its link to the master is down or a sync is in progress.
# yes: keep serving; no: reply with an error to most commands.
# (This directive does not select between RDB full sync and PSYNC.)
replica-serve-stale-data yes

# Replication sync strategy: diskless sends the RDB straight over the network
repl-diskless-sync yes
repl-diskless-sync-delay 5

# Replication timeout
repl-timeout 60

# ----------------------------------------
# Security
# ----------------------------------------
# Access password (use a strong one)
requirepass your_strong_password_here

# Password replicas use to authenticate against the master
masterauth your_strong_password_here

# Rename dangerous commands (hardening); renaming to "" disables the command
rename-command FLUSHDB ""
rename-command FLUSHALL ""
rename-command KEYS ""
rename-command CONFIG ""
rename-command DEBUG ""
# redis.conf is not processed by a shell, so "$(...)" is never expanded and
# would be stored literally as the new command name. Generate the random
# suffix when the file is provisioned (e.g. `openssl rand -hex 8`) and write
# the resulting literal here in place of the placeholder.
rename-command SHUTDOWN "SHUTDOWN_REPLACE_WITH_RANDOM_SUFFIX"

# ----------------------------------------
# Client limits
# ----------------------------------------
# Maximum number of client connections
maxclients 10000

# Client output buffer limits
# Format: <class> <hard limit> <soft limit> <soft seconds>
# normal: regular clients
# replica: replica clients
# pubsub: publish/subscribe clients
client-output-buffer-limit normal 0 0 0
client-output-buffer-limit replica 256mb 64mb 60
client-output-buffer-limit pubsub 32mb 8mb 60

# Query buffer size limit
client-query-buffer-limit 1gb

# ----------------------------------------
# Memory management
# ----------------------------------------
# Maximum memory (leave about 20% for the system)
maxmemory 8gb

# Eviction policies:
# - noeviction: never evict; reject writes when memory is full
# - allkeys-lru: evict the least recently used key among all keys
# - volatile-lru: evict the least recently used key among keys with a TTL
# - allkeys-lfu: evict the least frequently used key among all keys
# - volatile-lfu: evict the least frequently used key among keys with a TTL
# - allkeys-random: evict a random key
# - volatile-random: evict a random key among keys with a TTL
# - volatile-ttl: evict the keys closest to expiring
maxmemory-policy allkeys-lru

# LRU/LFU sample size; larger is more accurate but slower
maxmemory-samples 5

# Replicas ignore maxmemory (keeps replication complete)
replica-ignore-maxmemory yes

# ----------------------------------------
# Lazy freeing (asynchronous deletion of big keys)
# ----------------------------------------
# Release memory in a background thread instead of blocking the main thread.
# These options default to "no"; they must be "yes" for this section to have
# any effect.
# On eviction under maxmemory
lazyfree-lazy-eviction yes
# On key expiry
lazyfree-lazy-expire yes
# On implicit deletes done by commands (e.g. RENAME overwriting a key)
lazyfree-lazy-server-del yes
# When a replica flushes its dataset before a full resynchronization
replica-lazy-flush yes

# ----------------------------------------
# AOF persistence
# ----------------------------------------
# Enable AOF
appendonly yes

# AOF file name
appendfilename "appendonly.aof"

# AOF fsync policy:
# - always: fsync on every write (safest, slowest)
# - everysec: fsync once per second (recommended)
# - no: let the operating system decide (fastest, may lose data)
appendfsync everysec

# Whether to skip fsync while an AOF rewrite is running
no-appendfsync-on-rewrite no

# AOF rewrite trigger conditions
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 64mb

# Tolerate a truncated tail when loading the AOF file
aof-load-truncated yes

# Enable mixed RDB-AOF persistence (Redis 4.0+)
aof-use-rdb-preamble yes

# ----------------------------------------
# Lua scripts
# ----------------------------------------
# Maximum Lua script execution time (milliseconds); 0 means no limit
lua-time-limit 5000

# ----------------------------------------
# Redis Cluster
# ----------------------------------------
# Enable cluster mode
cluster-enabled yes

# Cluster state file (generated automatically)
cluster-config-file /data/nodes.conf

# Node timeout (milliseconds)
cluster-node-timeout 5000

# Replica validity factor
cluster-slave-validity-factor 10

# Require full slot coverage (whether to serve when some slots are uncovered)
cluster-require-full-coverage yes

# Migration barrier
cluster-migration-barrier 1

# ----------------------------------------
# Slow log
# ----------------------------------------
# Slow-query threshold (microseconds); a negative value disables the log
slowlog-log-slower-than 10000

# Maximum number of slow-log entries
slowlog-max-len 128

# ----------------------------------------
# Latency monitoring
# ----------------------------------------
# Latency monitor threshold (milliseconds); 0 disables it
latency-monitor-threshold 100

# ----------------------------------------
# Event notification
# ----------------------------------------
# Keyspace notifications (enable as needed); empty string here
notify-keyspace-events ""

# ----------------------------------------
# Advanced
# ----------------------------------------
# Hash compact-encoding thresholds
hash-max-ziplist-entries 512
hash-max-ziplist-value 64

# List compact-encoding thresholds
list-max-ziplist-size -2
list-compress-depth 0

# Set compact-encoding threshold
set-max-intset-entries 512

# Sorted-set compact-encoding thresholds
zset-max-ziplist-entries 128
zset-max-ziplist-value 64

# HyperLogLog sparse-representation threshold
hll-sparse-max-bytes 3000

# Stream node thresholds
stream-node-max-bytes 4096
stream-node-max-entries 100

# Active defragmentation
activedefrag yes
active-defrag-ignore-bytes 100mb
active-defrag-threshold-lower 10
active-defrag-threshold-upper 100
active-defrag-cycle-min 1
active-defrag-cycle-max 25

2.2 Sentinel 配置详解(仅适用于非 Cluster 的主从架构;Redis Cluster 自带故障转移,无需 Sentinel)

# /etc/redis/sentinel.conf - Sentinel configuration
port 26379
daemonize no
pidfile /var/run/redis/redis-sentinel.pid
logfile /var/log/redis/sentinel.log
dir /tmp

# Master to monitor
# mymaster: name given to the master
# 127.0.0.1 6379: master address
# 2: quorum - number of Sentinels that must agree the master is down
# NOTE(review): 127.0.0.1 is presumably a placeholder; Sentinels on other
# hosts need the master's reachable address - confirm before use.
sentinel monitor mymaster 127.0.0.1 6379 2

# Master password
sentinel auth-pass mymaster your_strong_password_here

# Time without a valid reply before the master is considered subjectively
# down (milliseconds)
sentinel down-after-milliseconds mymaster 30000

# Failover timeout (milliseconds)
sentinel failover-timeout mymaster 180000

# Number of replicas that may resync with the new master at the same time
sentinel parallel-syncs mymaster 1

# Script executed after a failover
sentinel client-reconfig-script mymaster /etc/redis/failover-notify.sh

# Notification script (triggered by any warning-level event)
sentinel notification-script mymaster /etc/redis/notify.sh

三、系统内核/OS 层优化

3.1 系统内核参数调优

# /etc/sysctl.conf - kernel parameters tuned for Redis
# (or place them in /etc/sysctl.d/99-redis.conf)
#
# Only "key = value" lines are valid in this file. Shell commands that belong
# to this tuning step are shown as comments and must be run separately.

# ============================================
# Memory management
# ============================================
# Disable THP (Transparent Huge Pages): THP causes latency spikes in Redis.
# THP is not a sysctl; run these as root (and persist them with a boot-time
# unit or script):
#   echo never > /sys/kernel/mm/transparent_hugepage/enabled
#   echo never > /sys/kernel/mm/transparent_hugepage/defrag

# Always allow memory overcommit so the fork() used by BGSAVE / AOF rewrite
# does not fail under memory pressure (Redis logs a startup warning when this
# is not set to 1).
vm.overcommit_memory = 1

# Maximum number of memory map areas per process
vm.max_map_count = 262144

# Swap tendency (1 = avoid swapping as much as possible)
vm.swappiness = 1

# Dirty page ratios (write buffering)
vm.dirty_ratio = 30
vm.dirty_background_ratio = 10

# ============================================
# Network
# ============================================
# System-wide maximum number of open files
fs.file-max = 1000000

# TCP connection queues
net.core.somaxconn = 65535
net.ipv4.tcp_max_syn_backlog = 65535

# TCP connection reuse
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_fin_timeout = 30

# TCP keepalive
net.ipv4.tcp_keepalive_time = 300
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 3

# TCP memory and socket buffer sizes
net.ipv4.tcp_mem = 786432 1048576 1572864
net.core.rmem_default = 262144
net.core.rmem_max = 16777216
net.core.wmem_default = 262144
net.core.wmem_max = 16777216

# NIC receive backlog length
net.core.netdev_max_backlog = 65535

# ============================================
# Process scheduling
# ============================================
# Maximum PID value
kernel.pid_max = 4194303

# ============================================
# File system
# ============================================
# Inotify limits
fs.inotify.max_user_instances = 8192
fs.inotify.max_user_watches = 524288

# To apply the settings, run as root:
#   sysctl -p          (for /etc/sysctl.conf)
#   sysctl --system    (for files under /etc/sysctl.d/)

3.2 用户资源限制

# /etc/security/limits.conf - resource limits for the redis user
# Each line is: <user> <soft|hard> <item> <value>

# Maximum number of open files
redis soft nofile 65536
redis hard nofile 65536

# Maximum number of processes
redis soft nproc 65536
redis hard nproc 65536

# Locked-memory limit set to unlimited.
# NOTE(review): the original note says this "disables swap"; the limit itself
# only raises how much memory the user may lock - confirm the intent.
redis soft memlock unlimited
redis hard memlock unlimited

# Maximum number of pending signals
redis soft sigpending 65536
redis hard sigpending 65536

3.3 系统服务优化

# Install a oneshot systemd unit that writes "never" to the THP "enabled"
# and "defrag" knobs; the unit is ordered before basic.target.
# The heredoc delimiter is quoted ('EOF'), so the unit text is written
# verbatim with no shell expansion.
cat > /etc/systemd/system/disable-thp.service << 'EOF'
[Unit]
Description=Disable Transparent Huge Pages (THP)
DefaultDependencies=no
After=sysinit.target local-fs.target
Before=basic.target

[Service]
Type=oneshot
ExecStart=/bin/sh -c 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'
ExecStart=/bin/sh -c 'echo never > /sys/kernel/mm/transparent_hugepage/defrag'

[Install]
WantedBy=basic.target
EOF

# Reload unit files, enable the unit for future boots, and run it now.
systemctl daemon-reload
systemctl enable disable-thp
systemctl start disable-thp

3.4 磁盘 I/O 调度优化

# Show the available schedulers; the active one is printed in [brackets]
cat /sys/block/sda/queue/scheduler

# SSD: use the no-op scheduler. It is named "none" on multi-queue (blk-mq)
# kernels and "noop" on legacy single-queue kernels, so pick whichever name
# this kernel offers instead of hard-coding one.
for sched in none noop; do
  if grep -qw "$sched" /sys/block/sda/queue/scheduler; then
    echo "$sched" > /sys/block/sda/queue/scheduler
    break
  fi
done

# HDD alternative: the deadline scheduler ("mq-deadline" on blk-mq kernels,
# "deadline" on legacy kernels; legacy kernels also offer "cfq"). Run only
# one of the alternatives, not both:
# echo mq-deadline > /sys/block/sda/queue/scheduler

# To make the setting permanent, add it to /etc/rc.local or a systemd unit.

3.5 NUMA 优化

# Show the NUMA topology
numactl --hardware

# Bind Redis (CPU and memory) to one NUMA node
numactl --cpunodebind=0 --membind=0 /usr/bin/redis-server /etc/redis/redis.conf

# Or configure the binding through a systemd unit.
# The heredoc delimiter is quoted ('EOF'), so the unit text is written
# verbatim with no shell expansion.
cat > /etc/systemd/system/redis.service << 'EOF'
[Unit]
Description=Redis Server
After=network.target

[Service]
# Type=notify makes systemd wait for a readiness notification. Redis only
# sends it when running with "supervised systemd" (and daemonize no), so the
# option is passed on the command line; without it the start job times out.
Type=notify
User=redis
Group=redis
ExecStart=/usr/bin/redis-server /etc/redis/redis.conf --supervised systemd
ExecStop=/usr/bin/redis-cli shutdown
Restart=always
LimitNOFILE=65536
# NUMA binding
NUMAPolicy=bind
NUMAMask=0

[Install]
WantedBy=multi-user.target
EOF

四、监控与告警

4.1 Prometheus Redis Exporter 配置

# prometheus.yml - Redis scrape configuration
scrape_configs:
  # Scrapes port 9121 on each of the six redis-node-0N hosts listed below.
  - job_name: 'redis'
    static_configs:
      - targets:
          - redis-node-01:9121
          - redis-node-02:9121
          - redis-node-03:9121
          - redis-node-04:9121
          - redis-node-05:9121
          - redis-node-06:9121
    relabel_configs:
      # Copy the host part of __address__ (everything before ":<port>") into
      # the "instance" label, so the label carries no port number.
      - source_labels: [__address__]
        regex: '([^:]+):\d+'
        target_label: instance
        replacement: '${1}'

  # Multi-target pattern: one central exporter probes several Redis
  # instances through its /scrape endpoint. The targets must be the Redis
  # instances themselves; the relabeling below moves each one into the
  # "target" URL parameter and then points the actual HTTP request at the
  # exporter. Listing the exporter as the target would make it try to probe
  # itself as a Redis server.
  - job_name: 'redis-cluster'
    static_configs:
      - targets:
          - redis://redis-node-01:6379
          - redis://redis-node-02:6379
          - redis://redis-node-03:6379
          - redis://redis-node-04:6379
          - redis://redis-node-05:6379
          - redis://redis-node-06:6379
    metrics_path: /scrape
    relabel_configs:
      # Redis address -> ?target=<address>
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the Redis address as the instance label
      - source_labels: [__param_target]
        target_label: instance
      # Send the scrape request to the exporter
      - target_label: __address__
        replacement: redis-exporter:9121

4.2 Redis Exporter 启动配置

# docker-compose.yml - Redis Exporter
# Runs a single redis_exporter container published on host port 9121 and
# pointed at redis-node-01:6379.
# NOTE(review): the top-level "version" key is treated as obsolete by current
# Docker Compose releases; confirm whether it is still needed.
version: '3.8'
services:
  redis-exporter:
    # NOTE(review): "latest" is an unpinned tag; consider a fixed version so
    # metric names do not change under the dashboards and alerts.
    image: oliver006/redis_exporter:latest
    ports:
      - "9121:9121"
    environment:
      - REDIS_ADDR=redis://redis-node-01:6379
      # Placeholder credential; replace before use.
      - REDIS_PASSWORD=your_password
      - REDIS_EXPORTER_DEBUG=false
      - REDIS_EXPORTER_INCL_SYSTEM_METRICS=true
      - REDIS_EXPORTER_SKIP_TLS_VERIFICATION=true
      # Monitoring multiple Redis instances
      - REDIS_EXPORTER_REDIS_ONLY_METRICS=false
    # NOTE(review): verify the two --check-* flags against the exporter's
    # documentation: "keys-*" looks like a glob pattern while the flag name
    # says single keys, and "count,avg" does not look like a key-group
    # pattern.
    command:
      - '--web.listen-address=:9121'
      - '--web.telemetry-path=/metrics'
      - '--log-format=txt'
      - '--check-single-keys=db0=keys-*'
      - '--check-key-groups=db0=count,avg'
    restart: always

4.3 Grafana 面板 JSON

{
  "dashboard": {
    "title": "Redis Cluster Monitor",
    "uid": "redis-cluster-monitor",
    "tags": ["redis", "database", "cluster"],
    "timezone": "browser",
    "refresh": "30s",
    "panels": [
      {
        "title": "Redis 内存使用率",
        "type": "gauge",
        "gridPos": {"h": 8, "w": 6, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "redis_memory_used_bytes / redis_memory_max_bytes * 100",
            "legendFormat": "{{instance}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "mode": "absolute",
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 85}
              ]
            }
          }
        }
      },
      {
        "title": "QPS (每秒操作数)",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 6, "y": 0},
        "targets": [
          {
            "expr": "rate(redis_commands_processed_total[1m])",
            "legendFormat": "{{instance}}",
            "refId": "A"
          }
        ]
      },
      {
        "title": "连接数",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 6, "x": 0, "y": 8},
        "targets": [
          {
            "expr": "redis_connected_clients",
            "legendFormat": "{{instance}}",
            "refId": "A"
          }
        ]
      },
      {
        "title": "Key 过期/淘汰统计",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 6, "y": 8},
        "targets": [
          {
            "expr": "rate(redis_expired_keys_total[1m])",
            "legendFormat": "过期 {{instance}}",
            "refId": "A"
          },
          {
            "expr": "rate(redis_evicted_keys_total[1m])",
            "legendFormat": "淘汰 {{instance}}",
            "refId": "B"
          }
        ]
      },
      {
        "title": "复制延迟",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
        "targets": [
          {
            "expr": "redis_connected_slave_lag_seconds",
            "legendFormat": "{{instance}}",
            "refId": "A"
          }
        ]
      },
      {
        "title": "慢日志统计",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 16},
        "targets": [
          {
            "expr": "delta(redis_slowlog_last_id[5m])",
            "legendFormat": "慢查询数",
            "refId": "A"
          }
        ]
      },
      {
        "title": "集群状态",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 20},
        "targets": [
          {
            "expr": "redis_cluster_state",
            "legendFormat": "集群状态",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {"type": "value", "options": {"0": {"text": "Fail", "color": "red"}}},
              {"type": "value", "options": {"1": {"text": "OK", "color": "green"}}}
            ]
          }
        }
      },
      {
        "title": "持久化状态",
        "type": "table",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
        "targets": [
          {
            "expr": "redis_rdb_last_bgsave_status",
            "format": "table",
            "refId": "A"
          },
          {
            "expr": "redis_aof_last_bgrewrite_status",
            "format": "table",
            "refId": "B"
          }
        ]
      },
      {
        "title": "网络带宽",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
        "targets": [
          {
            "expr": "rate(redis_net_input_bytes_total[1m])",
            "legendFormat": "入站 {{instance}}",
            "refId": "A"
          },
          {
            "expr": "rate(redis_net_output_bytes_total[1m])",
            "legendFormat": "出站 {{instance}}",
            "refId": "B"
          }
        ]
      },
      {
        "title": "命令执行时间分布",
        "type": "heatmap",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 32},
        "targets": [
          {
            "expr": "histogram_quantile(0.99, rate(redis_command_duration_seconds_bucket[5m]))",
            "legendFormat": "P99",
            "refId": "A"
          }
        ],
        "dataFormat": "timeseries"
      },
      {
        "title": "键空间分布",
        "type": "piechart",
        "gridPos": {"h": 8, "w": 6, "x": 12, "y": 32},
        "targets": [
          {
            "expr": "redis_db_keys{db!~\"db[0-9]+_expires\"}",
            "legendFormat": "{{db}}",
            "refId": "A"
          }
        ]
      },
      {
        "title": "客户端连接分布",
        "type": "piechart",
        "gridPos": {"h": 8, "w": 6, "x": 18, "y": 32},
        "targets": [
          {
            "expr": "redis_connected_clients",
            "legendFormat": "{{instance}}",
            "refId": "A"
          }
        ]
      }
    ]
  }
}

4.4 Prometheus 告警规则(由 Alertmanager 负责路由与通知)

# redis-alerts.yml - Redis alerting rules
groups:
  - name: redis
    rules:
      # ===== Cluster state alerts =====
      # Fires per instance when redis_up has been 0 for one minute.
      - alert: RedisClusterDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 节点 {{ $labels.instance }} 不可用"
          description: "Redis 节点 {{ $labels.instance }} 已经宕机超过 1 分钟"

      # Fires when redis_cluster_state has been 0 for 30 seconds.
      - alert: RedisClusterStateDown
        expr: redis_cluster_state == 0
        for: 30s
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 集群状态异常"
          description: "Redis 集群状态为 FAIL,可能存在槽位未覆盖"

      # Fires when the number of assigned slots differs from 16384.
      - alert: RedisClusterSlotsDown
        expr: redis_cluster_slots_assigned != 16384
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 集群槽位分配不完整"
          description: "当前已分配槽位 {{ $value }},应为 16384"

      # ===== Memory alerts =====
      # Usage is computed against maxmemory. The "> 0" filter on
      # redis_memory_max_bytes drops instances that report 0 (no maxmemory
      # configured); without it the division yields +Inf and the alerts
      # would fire permanently for those instances.
      - alert: RedisMemoryHighUsage
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 85
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Redis 内存使用率过高"
          description: "{{ $labels.instance }} 内存使用率 {{ $value | printf \"%.2f\" }}%"

      - alert: RedisMemoryCriticalUsage
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 95
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 内存即将耗尽"
          description: "{{ $labels.instance }} 内存使用率 {{ $value | printf \"%.2f\" }}%,请立即处理"

      # Fires when peak memory grew in the last 5 minutes and current usage
      # is at or above a configured (non-zero) maxmemory.
      - alert: RedisOOM
        expr: increase(redis_memory_used_peak_bytes[5m]) > 0 and redis_memory_used_bytes >= (redis_memory_max_bytes > 0)
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 已触发 OOM"
          description: "{{ $labels.instance }} 已达到最大内存限制,开始淘汰键"

      # ===== Connection alerts =====
      # Fires when connected clients exceed 80% of the configured maxclients
      # for five minutes.
      - alert: RedisTooManyConnections
        expr: redis_connected_clients / redis_config_maxclients * 100 > 80
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Redis 连接数过高"
          description: "{{ $labels.instance }} 连接数 {{ $value | printf \"%.2f\" }}%"

      # Fires when the rejected-connections counter increased within the
      # last five minutes.
      - alert: RedisRejectedConnections
        expr: increase(redis_rejected_connections_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 拒绝连接"
          description: "{{ $labels.instance }} 因 maxclients 限制拒绝连接"

      # ===== Replication alerts =====
      # Lag of each connected replica as reported by its master, in seconds
      # (redis_exporter metric redis_connected_slave_lag_seconds).
      - alert: RedisReplicaLag
        expr: redis_connected_slave_lag_seconds > 10
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Redis 副本同步延迟过高"
          description: "{{ $labels.instance }} 副本延迟 {{ $value }} 秒"

      # redis_master_link_up is only exported by replicas, so no role filter
      # is needed. PromQL cannot compare a series with a string, which made
      # the former `redis_instance_role == "slave"` clause a parse error.
      - alert: RedisReplicaDisconnected
        expr: redis_master_link_up == 0
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis 副本与主节点断开"
          description: "{{ $labels.instance }} 副本连接已断开"

      # ===== Persistence alerts =====
      # Fires when the last-BGSAVE status metric is anything other than 1.
      - alert: RedisRDBSaveFailed
        expr: redis_rdb_last_bgsave_status != 1
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis RDB 快照保存失败"
          description: "{{ $labels.instance }} 最近一次 BGSAVE 失败"

      # Fires when the last-AOF-rewrite status metric is anything other than 1.
      - alert: RedisAOFRewriteFailed
        expr: redis_aof_last_bgrewrite_status != 1
        for: 1m
        labels:
          severity: critical
          team: sre
        annotations:
          summary: "Redis AOF 重写失败"
          description: "{{ $labels.instance }} 最近一次 AOF 重写失败"

      # Fires when the last RDB save is more than 3600 seconds old while
      # there are unsaved changes.
      - alert: RedisPersistenceStalled
        expr: time() - redis_rdb_last_save_timestamp_seconds > 3600 and redis_rdb_changes_since_last_save > 0
        for: 5m
        labels:
          severity: warning
          team: sre
        annotations:
          summary: "Redis 持久化长时间未执行"
          description: "{{ $labels.instance }} 超过 1 小时未执行 RDB 快照"

results matching ""

    No results matching ""