SRE 话题文档:Linux 内核调优与性能分析
本文档面向生产环境,涵盖 Linux 内核参数调优、性能监控、故障排查等核心运维场景。
1. 生产环境部署架构
1.1 架构图(ASCII)
┌─────────────────────────────────────────────────────────────────────────────┐
│ 生产环境 Linux 集群架构 │
└─────────────────────────────────────────────────────────────────────────────┘
┌─────────────────┐
│ Load Balancer │
│ (HAProxy/Nginx)│
└────────┬────────┘
│
┌──────────────────┼──────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
│ Node-01 │ │ Node-02 │ │ Node-03 │
│ (Kernel 5.15)│ │ (Kernel 5.15)│ │ (Kernel 5.15)│
│ │ │ │ │ │
│ ┌───────────┐ │ │ ┌───────────┐ │ │ ┌───────────┐ │
│ │ Kubelet │ │ │ │ Kubelet │ │ │ │ Kubelet │ │
│ ├───────────┤ │ │ ├───────────┤ │ │ ├───────────┤ │
│ │ Runtime │ │ │ │ Runtime │ │ │ │ Runtime │ │
│ │ (containerd)│ │ │ │(containerd)│ │ │ │(containerd)│
│ └───────────┘ │ │ └───────────┘ │ │ └───────────┘ │
└───────┬───────┘ └───────┬───────┘ └───────┬───────┘
│ │ │
└──────────────────┼──────────────────┘
│
┌───────┴───────┐
│ Shared Storage│
│ (Ceph/NFS) │
└───────────────┘
┌─────────────────────────────────────────────────────────────────────────────┐
│ 监控栈 (独立部署) │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Prometheus │ │ Grafana │ │Alertmanager │ │ Node Exporter│ │
│ │ (TSDB) │ │ (可视化) │ │ (告警) │ │ (指标采集) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────────────────────────┘
1.2 Kubernetes 部署配置
# node-exporter-daemonset.yaml
# One node-exporter pod per node, exposing host metrics on :9100.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      # Host PID/network namespaces so the exporter can read host-level
      # /proc statistics and bind directly on the node IP.
      hostPID: true
      hostNetwork: true
      containers:
        - name: node-exporter
          image: prom/node-exporter:v1.7.0
          args:
            - '--path.procfs=/host/proc'
            - '--path.sysfs=/host/sys'
            - '--path.rootfs=/host/root'
            # Kubernetes expands $(VAR) in args; "$$" escapes a literal "$".
            - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
            - '--collector.netclass.ignored-devices=^(veth.*)$$'
            - '--collector.netdev.device-exclude=^(veth.*)$$'
          ports:
            - name: metrics
              containerPort: 9100
              hostPort: 9100
          resources:
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 200m
              memory: 200Mi
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              readOnly: true
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
      # Tolerate every NoSchedule taint so the exporter also runs on
      # control-plane nodes.
      tolerations:
        - effect: NoSchedule
          operator: Exists
# kernel-tuner-daemonset.yaml - apply kernel tuning on every node
# NOTE: sysctl -w is runtime-only (lost on reboot). The DaemonSet re-applies
# the values whenever the pod is (re)created, but nodes should also carry a
# persistent /etc/sysctl.conf for reboot safety.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kernel-tuner
  namespace: kube-system
  labels:
    app: kernel-tuner
spec:
  selector:
    matchLabels:
      app: kernel-tuner
  template:
    metadata:
      labels:
        app: kernel-tuner
    spec:
      # Host namespaces + a privileged init container so sysctl -w reaches
      # the host kernel, not a container-private namespace.
      hostPID: true
      hostNetwork: true
      initContainers:
        - name: sysctl-tuner
          image: alpine:3.19
          securityContext:
            privileged: true
          command: ["/bin/sh", "-c"]
          args:
            - |
              # Network performance
              sysctl -w net.core.somaxconn=65535
              sysctl -w net.core.netdev_max_backlog=65535
              sysctl -w net.ipv4.tcp_max_syn_backlog=65535
              sysctl -w net.ipv4.tcp_tw_reuse=1
              sysctl -w net.ipv4.tcp_fin_timeout=15
              sysctl -w net.ipv4.tcp_keepalive_time=300
              sysctl -w net.ipv4.tcp_keepalive_probes=3
              sysctl -w net.ipv4.tcp_keepalive_intvl=30
              sysctl -w net.ipv4.tcp_syncookies=1
              sysctl -w net.ipv4.tcp_max_tw_buckets=65535
              sysctl -w net.ipv4.ip_local_port_range="1024 65535"
              # Memory
              sysctl -w vm.swappiness=10
              sysctl -w vm.dirty_ratio=15
              sysctl -w vm.dirty_background_ratio=5
              sysctl -w vm.overcommit_memory=1
              sysctl -w vm.max_map_count=262144
              # File descriptors / inotify
              sysctl -w fs.file-max=2097152
              sysctl -w fs.nr_open=2097152
              sysctl -w fs.inotify.max_user_instances=8192
              sysctl -w fs.inotify.max_user_watches=524288
              echo "Kernel parameters tuned successfully"
      containers:
        # The pause container only keeps the pod alive after tuning finishes.
        - name: pause
          image: registry.k8s.io/pause:3.9
      nodeSelector:
        kubernetes.io/os: linux
1.3 Docker Compose 部署(单机/测试环境)
# docker-compose.yml - monitoring stack (single-host / test environments)
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
    restart: unless-stopped
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:10.2.0
    container_name: grafana
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # NOTE(review): demo credential — change it (or use an env file) before
      # exposing this stack anywhere shared.
      - GF_SECURITY_ADMIN_PASSWORD=admin123
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-piechart-panel
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./dashboards:/etc/grafana/provisioning/dashboards
    restart: unless-stopped
    networks:
      - monitoring
    depends_on:
      - prometheus

  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/host/root'
      # "$$" is Compose's escape for a literal "$" in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    pid: host
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/host/root:ro
    restart: unless-stopped
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager-data:/alertmanager
    restart: unless-stopped
    networks:
      - monitoring

volumes:
  prometheus-data:
  grafana-data:
  alertmanager-data:

networks:
  monitoring:
    driver: bridge
2. 关键参数调优
2.1 核心配置文件详解
/etc/sysctl.conf - 内核参数主配置
# ==============================================================================
# Linux kernel performance tuning
# Scenarios: high-concurrency web servers, database servers, container hosts
# Kernel: 5.x (compatible with 4.x)
# Apply with: sysctl -p /etc/sysctl.conf
# ==============================================================================

# ------------------------------------------------------------------------------
# Network
# ------------------------------------------------------------------------------
# Max length of the accept (listen) queue; key for connection bursts.
# Default 128 (4096 since kernel 5.4); 65535 recommended in production.
net.core.somaxconn = 65535
# Per-CPU backlog of packets arriving faster than the kernel can process.
# Default 1000; 65535 for high-load hosts.
net.core.netdev_max_backlog = 65535
# SYN (half-open) queue length; a key knob for SYN-flood resilience.
net.ipv4.tcp_max_syn_backlog = 65535
# Reuse TIME_WAIT sockets for new outbound connections.
# Default 0 (2 "loopback only" on kernels >= 4.12); 1 enables it globally.
net.ipv4.tcp_tw_reuse = 1
# FIN-WAIT timeout in seconds; shorter values recycle sockets faster.
# Default 60; 15-30 recommended.
net.ipv4.tcp_fin_timeout = 15
# Idle time before the first keepalive probe (default 7200 s = 2 h).
net.ipv4.tcp_keepalive_time = 300
# Failed probes before the connection is dropped (default 9).
net.ipv4.tcp_keepalive_probes = 3
# Interval between keepalive probes in seconds (default 75).
net.ipv4.tcp_keepalive_intvl = 30
# SYN cookies: protection against SYN-flood attacks (0 = off, 1 = on).
net.ipv4.tcp_syncookies = 1
# Upper bound on TIME_WAIT sockets; above it new ones are reset.
# Default 4096; 65535 for high concurrency.
net.ipv4.tcp_max_tw_buckets = 65535
# Ephemeral (local) port range. Default "32768 60999".
net.ipv4.ip_local_port_range = 1024 65535
# TCP window scaling: allow receive windows larger than 64 KB.
net.ipv4.tcp_window_scaling = 1
# Socket buffer maxima and TCP autotuning ranges (bytes).
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
# Disable IPv6 (only if genuinely unused).
net.ipv6.conf.all.disable_ipv6 = 1
net.ipv6.conf.default.disable_ipv6 = 1

# ------------------------------------------------------------------------------
# Memory
# ------------------------------------------------------------------------------
# Swap preference: lower values favour RAM. Default 60; 1-10 for DB/cache.
vm.swappiness = 10
# Dirty-page ceiling (% of RAM) before writers must flush synchronously.
# Default 20; 15 works well on fast SSDs.
vm.dirty_ratio = 15
# Background flush threshold; keep it below dirty_ratio. Default 10.
vm.dirty_background_ratio = 5
# Overcommit policy: 0 = heuristic, 1 = always, 2 = never.
# Databases usually prefer 0; Redis & co. need 1.
vm.overcommit_memory = 1
# Max memory-map areas per process (default 65530; Elasticsearch/Redis
# need more).
vm.max_map_count = 262144
# Minimum free memory the kernel keeps in reserve (KB).
vm.min_free_kbytes = 65536
# NOTE: transparent hugepages are NOT controlled by sysctl — there is no
# "vm.transparent_hugepage" key, and such a line makes `sysctl -p` fail.
# Configure THP via sysfs instead:
#   echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
# or on the kernel command line: transparent_hugepage=madvise
# (databases often prefer "never").

# ------------------------------------------------------------------------------
# Filesystem
# ------------------------------------------------------------------------------
# System-wide max open files (default is roughly 100k).
fs.file-max = 2097152
# Per-process max file descriptors.
fs.nr_open = 2097152
# inotify instance / watch / queued-event limits; container and IDE-heavy
# workloads need far more than the defaults.
fs.inotify.max_user_instances = 8192
fs.inotify.max_user_watches = 524288
fs.inotify.max_queued_events = 16384

# ------------------------------------------------------------------------------
# Kernel / scheduler
# ------------------------------------------------------------------------------
# System-wide max PIDs and threads.
kernel.pid_max = 4194303
kernel.threads-max = 4194303
# Core dump file name pattern (/var/core must exist and be writable).
kernel.core_pattern = /var/core/core.%e.%p.%t
# Append the PID to core file names.
kernel.core_uses_pid = 1
/etc/security/limits.conf - 用户资源限制
# ==============================================================================
# Per-user resource limits
# Path: /etc/security/limits.conf
# Requires the PAM limits module to be loaded
# (/etc/pam.d/common-session must contain: session required pam_limits.so)
# ==============================================================================
# User 'nobody' - common for web servers / containers
nobody soft nofile 655350
nobody hard nofile 655350
nobody soft nproc 65535
nobody hard nproc 65535
nobody soft memlock unlimited
nobody hard memlock unlimited
# User 'root' - system administration
root soft nofile 655350
root hard nofile 655350
root soft nproc unlimited
root hard nproc unlimited
root soft memlock unlimited
root hard memlock unlimited
# User 'mysql' - database service
mysql soft nofile 655350
mysql hard nofile 655350
mysql soft nproc 65535
mysql hard nproc 65535
# User 'redis' - cache service
redis soft nofile 655350
redis hard nofile 655350
redis soft memlock unlimited
redis hard memlock unlimited
# User 'elasticsearch' - search engine
elasticsearch soft nofile 655350
elasticsearch hard nofile 655350
elasticsearch soft memlock unlimited
elasticsearch hard memlock unlimited
elasticsearch soft nproc 65535
elasticsearch hard nproc 65535
# User 'nginx' - web service
nginx soft nofile 655350
nginx hard nofile 655350
nginx soft nproc 65535
nginx hard nproc 65535
# Wildcard - default for all users (use with care; '*' does not apply to root)
* soft nofile 655350
* hard nofile 655350
# ==============================================================================
# Notes:
# - soft: soft limit; the user may raise it up to the hard limit (ulimit -n)
# - hard: hard limit; ceiling for the soft limit
# - nofile: number of open file descriptors
# - nproc: number of processes
# - memlock: locked-in-memory size (needed by Redis/ES)
# - stack: stack size
# ==============================================================================
/etc/systemd/system.conf - Systemd 全局配置
# ==============================================================================
# Systemd global resource limits
# Path: /etc/systemd/system.conf
# Applies to: services managed by systemd
# Apply with: systemctl daemon-reexec, then restart the affected services
# ==============================================================================
[Manager]
# Default limits inherited by every service unit (unit files can override)
DefaultLimitNOFILE=655350
DefaultLimitNPROC=655350
DefaultLimitMEMLOCK=infinity
DefaultLimitCORE=infinity
# Logging
LogLevel=info
LogTarget=journal
# Resource accounting (enable for per-unit CPU/memory/IO statistics)
#DefaultCPUAccounting=yes
#DefaultMemoryAccounting=yes
#DefaultIOAccounting=yes
3. 系统内核/OS 层优化
3.1 内核启动参数优化
/etc/default/grub
# GRUB kernel boot parameters (/etc/default/grub)
# After editing run: update-grub && reboot
#
# IMPORTANT: the value of GRUB_CMDLINE_LINUX is passed verbatim to the kernel.
# Do NOT put "#" comments inside the quoted string — inside double quotes the
# shell treats them as literal text, so the comment words would end up as
# bogus kernel parameters. Document each parameter outside the string:
#
#   numa=off                                 disable NUMA balancing (DB servers)
#   default_hugepagesz/hugepagesz/hugepages  1 GB huge pages (DB/HPC)
#   systemd.unified_cgroup_hierarchy=1       cgroup v2 (Kubernetes 1.25+)
#   transparent_hugepage=madvise             THP on request only
#                                            (databases may prefer "never";
#                                            pick ONE value — do not list it twice)
#   intel_idle.max_cstate=1 / processor.max_cstate=1
#                                            limit deep C-states (performance)
#   net.ifnames=0 biosdevname=0              legacy NIC names (eth0)
#   crashkernel=256M                         reserve memory for kdump
#
# NOTE: "elevator=" was removed from 5.x kernels; configure I/O schedulers
# per device via udev rules instead (see section 3.2).
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
GRUB_CMDLINE_LINUX="console=tty0 console=ttyS0,115200n8 numa=off default_hugepagesz=1G hugepagesz=1G hugepages=16 systemd.unified_cgroup_hierarchy=1 transparent_hugepage=madvise intel_idle.max_cstate=1 processor.max_cstate=1 net.ifnames=0 biosdevname=0 crashkernel=256M"
3.2 磁盘 I/O 调度器配置
# ==============================================================================
# I/O scheduler configuration
# Path: /sys/block/<device>/queue/scheduler
# Multiqueue (blk-mq) kernels 5.x offer: none, mq-deadline, bfq, kyber.
# The legacy single-queue schedulers (noop, deadline, cfq) were removed
# in kernel 5.0 — writing "noop" on a 5.x kernel fails with "Invalid argument".
# ==============================================================================
# NVMe SSD - no scheduling; the hardware queues manage themselves
echo none > /sys/block/nvme0n1/queue/scheduler
# SATA SSD - multi-queue deadline
echo mq-deadline > /sys/block/sda/queue/scheduler
# HDD - budget fair queueing
echo bfq > /sys/block/sdb/queue/scheduler

# Persistent configuration via udev rules
cat > /etc/udev/rules.d/60-io-scheduler.rules << 'EOF'
# NVMe SSD
ACTION=="add|change", KERNEL=="nvme[0-9]n[0-9]", ATTR{queue/scheduler}="none"
# SATA SSD
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="mq-deadline"
# HDD
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="1", ATTR{queue/scheduler}="bfq"
EOF

# Reload and apply the udev rules
udevadm control --reload-rules
udevadm trigger

# ==============================================================================
# Scheduler selection guide:
# - NVMe SSD: none (no scheduling; hardware queues manage themselves)
# - SATA SSD: mq-deadline (multi-queue deadline)
# - HDD: bfq (budget fair queueing, good for interactive loads)
# - Databases: mq-deadline (low latency jitter)
# ==============================================================================
3.3 网卡中断优化(RSS/RPS)
# ==============================================================================
# NIC interrupt tuning (RSS/RPS/XPS)
# Goal: spread NIC interrupt and packet-processing load across CPU cores
# instead of saturating a single core.
# ==============================================================================
# Show the NIC queue layout
ethtool -l eth0
# Set the number of RSS (Receive Side Scaling) combined queues
ethtool -L eth0 combined 8
# RPS (Receive Packet Steering) CPU mask per RX queue.
# Mask "ff" = CPUs 0-7 — adjust to the host's actual core count.
for q in /sys/class/net/eth0/queues/rx-*; do
echo ff > $q/rps_cpus
done
# XPS (Transmit Packet Steering) CPU mask per TX queue
for q in /sys/class/net/eth0/queues/tx-*; do
echo ff > $q/xps_cpus
done
# IRQ affinity script: round-robin the NIC's interrupts across all cores
#!/bin/bash
# irq-affinity.sh - bind NIC interrupts to CPU cores
DEV=eth0
# IRQ numbers of all interrupt lines whose name contains $DEV
IRQS=$(grep $DEV /proc/interrupts | awk '{print $1}' | tr -d ':')
CPU=0
CPU_COUNT=$(nproc)
for IRQ in $IRQS; do
# Hex affinity mask with only bit $CPU set.
# NOTE(review): the shift arithmetic assumes < 64 CPUs — confirm on large hosts.
MASK=$(echo "obase=16; $((1 << CPU))" | bc)
echo $MASK > /proc/irq/$IRQ/smp_affinity
CPU=$(( (CPU + 1) % CPU_COUNT ))
done
3.4 CPU 频率调节
# ==============================================================================
# CPU frequency scaling governor
# Path: /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
# Available governors: performance, powersave, ondemand, conservative, userspace
# ==============================================================================
# Show the current governor (cpu0)
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
# Force the performance governor on every core (disables power saving)
for cpu in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
echo performance > $cpu
done
# Equivalent using the cpupower tool
cpupower frequency-set -g performance
# Disable Intel P-State power management (BIOS or kernel parameter)
# Add to /etc/default/grub:
# GRUB_CMDLINE_LINUX="intel_pstate=disable"
3.5 内存大页配置
# ==============================================================================
# HugePages configuration - for databases and virtualization
# ==============================================================================
# Current hugepage state
grep Huge /proc/meminfo
# Sizing: HugePages = (target memory in MB * 1024) / Hugepagesize(KB)
# Example: 8 GB of huge pages with a 2 MB page size:
# HugePages = (8192 * 1024) / 2048 = 4096
# Runtime (non-persistent) setting
echo 4096 > /proc/sys/vm/nr_hugepages
# Persistent setting - add this line to /etc/sysctl.conf
vm.nr_hugepages = 4096
# 1 GB huge pages (requires kernel/CPU support; reserved at boot)
# Kernel cmdline: default_hugepagesz=1G hugepagesz=1G hugepages=16
# Mount hugetlbfs
mkdir -p /dev/hugepages
mount -t hugetlbfs nodev /dev/hugepages
# Persistent mount - /etc/fstab entry
hugetlbfs /dev/hugepages hugetlbfs mode=1770,gid=hugetlbfs 0 0
# Group allowed to use the mount; add service users to it
groupadd hugetlbfs
usermod -a -G hugetlbfs mysql # database user
4. 监控与告警
4.1 Prometheus 配置
prometheus.yml
# ==============================================================================
# Prometheus configuration
# ==============================================================================
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    env: 'prod'

# Alerting rule files
rule_files:
  - "alerts/*.yml"

# Alertmanager endpoints
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Scrape targets
scrape_configs:
  # Node Exporter - host metrics
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'node-01:9100'
          - 'node-02:9100'
          - 'node-03:9100'
        labels:
          group: 'production'

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Kernel metrics (via node_exporter's textfile collector)
  - job_name: 'kernel-metrics'
    static_configs:
      - targets: ['localhost:9100']
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        replacement: 'kernel-stats'

# Remote storage (optional)
# remote_write:
#   - url: "http://remote-storage:9090/api/v1/write"
4.2 内核监控告警规则
alerts/kernel-alerts.yml
# ==============================================================================
# Kernel performance alerting rules
# ==============================================================================
groups:
  - name: kernel-alerts
    rules:
      # CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "实例 {{ $labels.instance }} CPU 使用率 {{ $value | printf \"%.1f\" }}%"
      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CPU 使用率严重过高"
          description: "实例 {{ $labels.instance }} CPU 使用率 {{ $value | printf \"%.1f\" }}%,需立即处理"

      # Memory usage
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "实例 {{ $labels.instance }} 内存使用率 {{ $value | printf \"%.1f\" }}%"
      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "内存即将耗尽"
          description: "实例 {{ $labels.instance }} 内存使用率 {{ $value | printf \"%.1f\" }}%,可能触发 OOM"

      # Swap usage (hosts with no swap yield NaN, which never compares > 50)
      - alert: SwapUsage
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Swap 使用过高"
          description: "实例 {{ $labels.instance }} Swap 使用率 {{ $value | printf \"%.1f\" }}%,性能可能下降"

      # Disk space
      - alert: DiskSpaceWarning
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间不足"
          description: "实例 {{ $labels.instance }} 挂载点 {{ $labels.mountpoint }} 使用率 {{ $value | printf \"%.1f\" }}%"
      - alert: DiskSpaceCritical
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "磁盘空间严重不足"
          description: "实例 {{ $labels.instance }} 挂载点 {{ $labels.mountpoint }} 使用率 {{ $value | printf \"%.1f\" }}%"

      # I/O wait
      - alert: HighIOWait
        expr: avg by(instance) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "I/O 等待过高"
          description: "实例 {{ $labels.instance }} I/O 等待时间 {{ $value | printf \"%.1f\" }}%,磁盘可能有瓶颈"

      # Network errors
      - alert: NetworkReceiveErrors
        expr: rate(node_network_receive_errs_total[5m]) > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "网络接收错误过多"
          description: "实例 {{ $labels.instance }} 网卡 {{ $labels.device }} 接收错误率过高"
      - alert: NetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total[5m]) > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "网络发送错误过多"
          description: "实例 {{ $labels.instance }} 网卡 {{ $labels.device }} 发送错误率过高"

      # TCP connection states
      - alert: TCPTimeWaitHigh
        expr: node_netstat_Tcp_TimeWait / node_netstat_Tcp_CurrEstab > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "TIME_WAIT 连接过多"
          description: "实例 {{ $labels.instance }} TIME_WAIT 连接数 {{ $value | printf \"%.0f\" }} 倍于已建立连接"
      - alert: TCPListenDrops
        expr: rate(node_netstat_Tcp_ListenDrops[5m]) > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "TCP 监听队列丢弃"
          description: "实例 {{ $labels.instance }} TCP 监听队列正在丢弃连接,检查 somaxconn 配置"

      # File descriptors
      - alert: FileDescriptorUsage
        expr: (node_filefd_allocated / node_filefd_maximum) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "文件描述符使用过高"
          description: "实例 {{ $labels.instance }} 文件描述符使用率 {{ $value | printf \"%.1f\" }}%"

      # Blocked (D-state) processes
      - alert: UninterruptibleProcesses
        expr: node_procs_blocked > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "大量进程处于不可中断状态"
          description: "实例 {{ $labels.instance }} 有 {{ $value }} 个进程处于 D 状态,可能有 I/O 问题"

      # Load average relative to core count
      - alert: HighLoadAverage
        expr: node_load15 / count(node_cpu_seconds_total{mode="idle"}) without(cpu, mode) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "系统负载过高"
          description: "实例 {{ $labels.instance }} 15分钟负载 {{ $value | printf \"%.2f\" }} 倍于 CPU 核心数"
4.3 Grafana 面板 JSON
{
"dashboard": {
"title": "Linux Kernel Performance",
"uid": "linux-kernel-perf",
"version": 1,
"panels": [
{
"title": "CPU Usage Overview",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{ instance }} - Total"
},
{
"expr": "avg by(instance) (irate(node_cpu_seconds_total{mode=\"iowait\"}[5m])) * 100",
"legendFormat": "{{ instance }} - I/O Wait"
},
{
"expr": "avg by(instance) (irate(node_cpu_seconds_total{mode=\"system\"}[5m])) * 100",
"legendFormat": "{{ instance }} - System"
}
],
"yaxes": [
{"format": "percent", "max": 100, "min": 0},
{"format": "short"}
]
},
{
"title": "Memory Usage",
"type": "graph",
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "{{ instance }} - Used %"
},
{
"expr": "(node_memory_SwapUsed_bytes / 1024 / 1024 / 1024)",
"legendFormat": "{{ instance }} - Swap GB"
}
],
"yaxes": [
{"format": "percent", "max": 100, "min": 0},
{"format": "decgbytes"}
]
},
{
"title": "Disk I/O",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
"legendFormat": "{{ instance }} - {{ device }} Read MB/s"
},
{
"expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
"legendFormat": "{{ instance }} - {{ device }} Write MB/s"
}
],
"yaxes": [
{"format": "MBs"},
{"format": "short"}
]
},
{
"title": "Network Traffic",
"type": "graph",
"gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*\"}[5m]) / 1024 / 1024",
"legendFormat": "{{ instance }} - {{ device }} In MB/s"
},
{
"expr": "rate(node_network_transmit_bytes_total{device!~\"lo|veth.*\"}[5m]) / 1024 / 1024",
"legendFormat": "{{ instance }} - {{ device }} Out MB/s"
}
]
},
{
"title": "TCP Connection States",
"type": "graph",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "node_netstat_Tcp_CurrEstab",
"legendFormat": "{{ instance }} - Established"
},
{
"expr": "node_netstat_Tcp_TimeWait",
"legendFormat": "{{ instance }} - TimeWait"
},
{
"expr": "node_netstat_Tcp_SynRecv",
"legendFormat": "{{ instance }} - SynRecv"
}
]
},
{
"title": "File Descriptors",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "node_filefd_allocated",
"legendFormat": "{{ instance }} - Allocated"
},
{
"expr": "node_filefd_maximum",
"legendFormat": "{{ instance }} - Maximum"
}
]
},
{
"title": "Load Average",
"type": "graph",
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "node_load1",
"legendFormat": "{{ instance }} - 1m"
},
{
"expr": "node_load5",
"legendFormat": "{{ instance }} - 5m"
},
{
"expr": "node_load15",
"legendFormat": "{{ instance }} - 15m"
}
]
},
{
"title": "System Load per Core",
"type": "gauge",
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "node_load5 / count(node_cpu_seconds_total{mode=\"idle\"}) without(cpu, mode)",
"legendFormat": "{{ instance }}"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 1},
{"color": "red", "value": 2}
]
},
"max": 4,
"min": 0,
"unit": "short"
}
}
}
],
"refresh": "30s",
"schemaVersion": 38,
"time": {
"from": "now-6h",
"to": "now"
}
}
}
4.4 Alertmanager 配置
# ==============================================================================
# Alertmanager configuration
# ==============================================================================
global:
  resolve_timeout: 5m
  # SMTP settings for e-mail notifications
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'alertmanager@example.com'
  # NOTE(review): inject the real password from a secret store, not from VCS
  smtp_auth_password: 'your-password'

# Routing tree
route:
  group_by: ['alertname', 'severity', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'team-sre'
  routes:
    # Critical alerts -> notify immediately
    - match:
        severity: critical
      receiver: 'team-sre-critical'
      group_wait: 10s
      repeat_interval: 1h
    # Warnings -> slower cadence
    - match:
        severity: warning
      receiver: 'team-sre-warning'
      group_wait: 5m
      repeat_interval: 8h

# Receivers
receivers:
  - name: 'team-sre'
    email_configs:
      - to: 'sre-team@example.com'
        send_resolved: true
  - name: 'team-sre-critical'
    email_configs:
      - to: 'sre-team@example.com'
        send_resolved: true
    webhook_configs:
      - url: 'http://webhook-server:5000/alerts'
        send_resolved: true
    # DingTalk / WeCom example (needs a webhook bridge; not a native receiver)
    # dingtalk_configs:
    #   - webhook_url: 'https://oapi.dingtalk.com/robot/send?access_token=xxx'
    #     message: '{{ template "dingtalk.message" . }}'
  - name: 'team-sre-warning'
    email_configs:
      - to: 'sre-team@example.com'
        send_resolved: true

# Inhibition rules
inhibit_rules:
  # A down node suppresses all of its other alerts
  - source_match:
      alertname: 'NodeDown'
    target_match_re:
      alertname: '.*'
    equal: ['instance']
  # A critical alert suppresses the matching warning on the same instance
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal: ['alertname', 'instance']

# Notification templates
templates:
  - '/etc/alertmanager/templates/*.tmpl'
5. 常用运维命令
5.1 CPU 性能分析
# ==============================================================================
# CPU performance analysis commands
# ==============================================================================
# Overall CPU snapshot
top -bn1 | head -20
# Interactive, more detailed view
htop
# Per-core utilisation, 5 samples at 1 s intervals
mpstat -P ALL 1 5
# Per-process CPU usage
pidstat -p ALL 1 5
# Processes sorted by CPU
ps -eo pid,ppid,user,%cpu,%mem,cmd --sort=-%cpu | head -20
# Processes above 80% CPU.
# In top's default batch output %CPU is the 9th field, not the 8th
# (PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND).
top -b -n 1 | awk 'NR>7 && $9>80 {print}'
# CPU topology details
lscpu
# Current core frequencies
grep "MHz" /proc/cpuinfo
# Load averages
cat /proc/loadavg
uptime
# Context switches / run queue
vmstat 1 10
# Hardware interrupt counters
watch -n 1 cat /proc/interrupts
# Soft-interrupt counters
watch -n 1 cat /proc/softirqs
# Scheduling latency (run "perf sched record" first to collect data)
perf sched latency
# CPU flame graph (FlameGraph scripts must be in PATH)
perf record -g -a -- sleep 30
perf script | stackcollapse-perf.pl | flamegraph.pl > cpu-flame.svg
# Threads of one process
ps -eLf | grep <pid>
# Per-thread CPU usage
top -H -p <pid>
5.2 内存分析
# ==============================================================================
# Memory analysis commands
# ==============================================================================
# Memory overview
free -h
# Detailed memory counters
cat /proc/meminfo
# Memory statistics
vmstat -s
# Processes sorted by memory use
ps -eo pid,ppid,user,%cpu,%mem,rss,cmd --sort=-%mem | head -20
# Memory map of one process
pmap -x <pid>
# Per-process memory detail (requires smem)
smem -t -k -s rss
# NUMA topology and per-node usage
numactl --hardware
numastat
# Hugepage usage
grep Huge /proc/meminfo
cat /proc/vmstat | grep huge
# Slab allocator usage
slabtop -o
# Leak detection (instrumented run of the suspect program)
valgrind --leak-check=full <command>
# Capture a live process image with gcore
gcore <pid>
# Inspect the resulting core file
gdb <binary> core.<pid>
# Shared memory segments
ipcs -m
ipcs -m -p
# Memory compaction statistics
cat /proc/vmstat | grep compact
# Transparent hugepage state
cat /sys/kernel/mm/transparent_hugepage/enabled
5.3 磁盘 I/O 分析
# ==============================================================================
# Disk I/O analysis commands
# ==============================================================================
# Extended per-device I/O statistics
iostat -x 1 10
# Filesystem usage
df -h
# Partitions and block devices
lsblk
fdisk -l
# Block device sysfs entries
ls -la /sys/block/
# Active I/O scheduler
cat /sys/block/sda/queue/scheduler
# Request queue depth
cat /sys/block/sda/queue/nr_requests
# Processes currently doing I/O (requires iotop)
iotop -oP
# Per-process I/O statistics
pidstat -d 1 5
# Raw disk counters
cat /proc/diskstats
# Random-read benchmark (creates a 1G test file — delete it afterwards)
fio --name=randread --ioengine=libaio --iodepth=16 --blocksize=4k --size=1G --rw=randread --filename=/tmp/testfile --direct=1
# Sequential write benchmark
dd if=/dev/zero of=/tmp/testfile bs=1M count=1024 oflag=direct conv=fdatasync
# Disk errors in the kernel log
dmesg | grep -iE 'error|fail|ata|scsi'
# ext filesystem metadata
tune2fs -l /dev/sda1
# Filesystem check (unmount the filesystem first!)
e2fsck -f /dev/sda1
# NFS client statistics
nfsstat -c
5.4 网络分析
# ==============================================================================
# Network analysis commands
# ==============================================================================
# Interface counters
ip -s link show eth0
# Live traffic by connection (requires iftop)
iftop -i eth0
# Protocol statistics
netstat -s
# TCP/UDP sockets with owning processes
ss -tunap
# Listening sockets
ss -tlnp
# Connection counts grouped by TCP state
netstat -nat | awk '{print $6}' | sort | uniq -c | sort -rn
# Packet capture
tcpdump -i eth0 -nn port 80 -w capture.pcap
tcpdump -i eth0 -nn -X port 443
# Latency
ping -c 10 target.com
# Path tracing
traceroute target.com
mtr -r -c 10 target.com
# DNS resolution
dig target.com
nslookup target.com
# NIC configuration / driver / counters
ethtool eth0
ethtool -i eth0 # driver info
ethtool -S eth0 # statistics
# Connection tracking (files exist only when nf_conntrack is loaded)
cat /proc/net/nf_conntrack
cat /proc/sys/net/netfilter/nf_conntrack_count
cat /proc/sys/net/netfilter/nf_conntrack_max
# Network namespaces
ip netns list
ip netns exec <nsname> ip addr
6. 故障排查案例
6.1 案例1:高 CPU 占用排查
# Scenario: server CPU usage persistently above 90%
# Step 1: identify the high-CPU process
top -b -n 1 | head -20
ps aux --sort=-%cpu | head -10
# Step 2: drill into its threads
pidstat -p <pid> -t 1 5
top -H -p <pid>
# Step 3: profile with perf
perf top -p <pid>
perf record -g -p <pid> -- sleep 30
perf report
# Step 4: render a flame graph (FlameGraph scripts must be in PATH)
perf script | stackcollapse-perf.pl | flamegraph.pl > cpu.svg
# Step 5: trace system calls with strace
strace -c -p <pid>
strace -T -p <pid> 2>&1 | head -100
# Common causes:
# - Busy loop: review the code logic
# - Frequent GC: tune JVM parameters
# - Lock contention: inspect thread stacks
# - Regex backtracking: review the regular expressions
6.2 案例2:内存不足排查
# Scenario: memory usage above 95%, OOM killer firing
# Step 1: confirm the memory state
free -h
cat /proc/meminfo | grep -E 'Mem|Swap|Cached|Buffers'
# Step 2: find the memory hogs
ps aux --sort=-%mem | head -10
smem -t -k -s rss | head -20
# Step 3: check slab usage
slabtop -o
cat /proc/meminfo | grep Slab
# Step 4: check transparent hugepages
cat /sys/kernel/mm/transparent_hugepage/enabled
# Step 5: look for leaks in the suspect binary
valgrind --leak-check=full /path/to/binary
# Step 6: analyze a core dump
gdb /path/to/binary core.<pid>
(gdb) bt full
# Step 7: temporary mitigation
# Flush dirty pages first: drop_caches only discards CLEAN page cache,
# dentries and inodes, so without sync it reclaims much less.
sync && echo 1 > /proc/sys/vm/drop_caches
sysctl -w vm.swappiness=60 # lean harder on swap temporarily
# Common causes:
# - Application memory leak: fix the code or restart
# - Oversized caches: tune the application's cache policy
# - Slab fragmentation: tune kernel parameters
# - Misconfigured huge pages: check the HugePages reservation
6.3 案例3:磁盘 I/O 高延迟
# Scenario: slow application responses, high I/O wait
# Step 1: confirm the I/O state
iostat -x 1 10
vmstat 1 10 | awk '{print $16}' # the "wa" column
# Step 2: find the heavy I/O processes
iotop -oP
pidstat -d 1 10
# Step 3: check the I/O scheduler
cat /sys/block/sda/queue/scheduler
# Step 4: check disk health (SMART + kernel log)
smartctl -a /dev/sda
dmesg | grep -iE 'error|fail|ata|scsi'
# Step 5: check the filesystem
df -h
tune2fs -l /dev/sda1 | grep -i state
# Step 6: tuning suggestions
# SSD: mq-deadline or none
echo mq-deadline > /sys/block/sda/queue/scheduler
# Increase the request queue depth
echo 256 > /sys/block/sda/queue/nr_requests
# Check RAID status (mdraid / MegaRAID)
cat /proc/mdstat
megacli -AdpAllInfo -aAll
# Common causes:
# - Wrong scheduler: switch schedulers
# - Failing disk: replace the drive
# - Fragmentation: defragment or move to SSD
# - Excessive application writes: optimize the application logic
6.4 案例4:网络连接超时
# Scenario: many connection timeouts, service unavailable
# Step 1: socket summary and per-state counts
ss -s
netstat -nat | awk '{print $6}' | sort | uniq -c
# Step 2: connection-tracking table usage
cat /proc/sys/net/netfilter/nf_conntrack_count
cat /proc/sys/net/netfilter/nf_conntrack_max
# Step 3: SYN (half-open) queue
netstat -nat | grep SYN_RECV | wc -l
cat /proc/sys/net/ipv4/tcp_max_syn_backlog
# Step 4: accept (full-connection) queue
ss -ltn
cat /proc/sys/net/core/somaxconn
# Step 5: NIC health
ethtool -S eth0 | grep -i error
ip -s link show eth0
# Step 6: packet capture analysis
tcpdump -i eth0 -nn port 80 -w issue.pcap
wireshark issue.pcap
# Step 7: tuning (runtime only; persist via /etc/sysctl.conf)
sysctl -w net.ipv4.tcp_max_syn_backlog=65535
sysctl -w net.core.somaxconn=65535
sysctl -w net.ipv4.tcp_tw_reuse=1
sysctl -w net.ipv4.tcp_fin_timeout=15
# Common causes:
# - SYN flood attack: enable tcp_syncookies
# - Queue overflow: enlarge the backlog queues
# - Too many TIME_WAIT sockets: reuse TIME_WAIT connections
# - Full conntrack table: raise nf_conntrack_max
7. 最佳实践
7.1 内核参数调优清单
| 场景 | 关键参数 | 建议值 |
|---|---|---|
| 高并发 Web | net.core.somaxconn | 65535 |
| 高并发 Web | net.ipv4.tcp_max_syn_backlog | 65535 |
| 高并发 Web | net.ipv4.tcp_tw_reuse | 1 |
| 数据库服务器 | vm.swappiness | 1-10 |
| 数据库服务器 | vm.dirty_ratio | 15 |
| 数据库服务器 | vm.dirty_background_ratio | 5 |
| Redis/Memory | vm.overcommit_memory | 1 |
| Elasticsearch | vm.max_map_count | 262144 |
| 容器宿主机 | fs.file-max | 2097152 |
| 容器宿主机 | fs.inotify.max_user_watches | 524288 |
| 网络密集型 | net.core.rmem_max | 16777216 |
| 网络密集型 | net.core.wmem_max | 16777216 |
7.2 监控指标优先级
| 优先级 | 指标 | 告警阈值 |
|---|---|---|
| P0 | CPU 使用率 | > 95% 持续 2 分钟 |
| P0 | 内存使用率 | > 95% 持续 2 分钟 |
| P0 | 磁盘使用率 | > 95% |
| P1 | I/O Wait | > 20% 持续 5 分钟 |
| P1 | 网络错误率 | > 100/s |
| P1 | TCP ListenDrops | > 0 |
| P2 | Load Average | > 2x CPU 核心数 |
| P2 | 文件描述符 | > 80% |
| P2 | Swap 使用 | > 50% |
7.3 定期维护任务
# Daily health-check script
#!/bin/bash
# daily-check.sh
# NOTE(review): when saved as a standalone file the shebang must be line 1.
echo "=== 系统状态检查 ==="
echo "日期: $(date)"
echo
echo "--- CPU ---"
top -bn1 | head -5
echo
echo "--- 内存 ---"
free -h
echo
echo "--- 磁盘 ---"
df -h | grep -E '^/dev|^Filesystem'
echo
echo "--- 网络 ---"
ss -s
echo
echo "--- 内核日志 ---"
dmesg | tail -20
echo
echo "--- 登录用户 ---"
who
echo
echo "=== 检查完成 ==="
8. 参考资料
- Linux 内核文档: https://www.kernel.org/doc/Documentation/
- sysctl 参数说明: https://www.kernel.org/doc/Documentation/sysctl/
- Node Exporter 文档: https://github.com/prometheus/node_exporter
- perf 教程: https://perf.wiki.kernel.org/
- eBPF 工具: https://github.com/iovisor/bcc
文档版本: 1.0 更新日期: 2024-01-15 适用环境: Linux 内核 4.x/5.x,CentOS 7/8,Ubuntu 18.04/20.04/22.04