SRE 每日主题:Kubernetes 生产运维与故障排查

日期: 2026-03-18
主题序号: 0 (0 % 12 = 0)
难度等级: ⭐⭐⭐⭐⭐
适用场景: 生产环境 Kubernetes 集群运维


一、生产环境部署架构

1.1 高可用集群架构(ASCII 图)

                            ┌─────────────────────────────────────────────────────────────┐
                            │                      外部负载均衡器                          │
                            │                   (API Server LB VIP)                       │
                            └───────────────────────────┬─────────────────────────────────┘
                                                        │
                    ┌───────────────────────────────────┼───────────────────────────────────┐
                    │                                   │                                   │
                    ▼                                   ▼                                   ▼
         ┌──────────────────┐               ┌──────────────────┐               ┌──────────────────┐
         │   Master Node 1  │               │   Master Node 2  │               │   Master Node 3  │
         │  ┌────────────┐  │               │  ┌────────────┐  │               │  ┌────────────┐  │
         │  │ API Server │  │               │  │ API Server │  │               │  │ API Server │  │
         │  ├────────────┤  │               │  ├────────────┤  │               │  ├────────────┤  │
         │  │ Scheduler  │  │               │  │ Scheduler  │  │               │  │ Scheduler  │  │
         │  ├────────────┤  │               │  ├────────────┤  │               │  ├────────────┤  │
         │  │Controller  │  │               │  │Controller  │  │               │  │Controller  │  │
         │  │  Manager   │  │               │  │  Manager   │  │               │  │  Manager   │  │
         │  ├────────────┤  │               │  ├────────────┤  │               │  ├────────────┤  │
         │  │   etcd     │◄─┼───────────────┼─►│   etcd     │◄─┼───────────────┼─►│   etcd     │  │
         │  └────────────┘  │               │  └────────────┘  │               │  └────────────┘  │
         └──────────────────┘               └──────────────────┘               └──────────────────┘
                    │                                   │                                   │
                    └───────────────────────────────────┼───────────────────────────────────┘
                                                        │
                    ┌───────────────────────────────────┼───────────────────────────────────┐
                    │                                   │                                   │
                    ▼                                   ▼                                   ▼
         ┌──────────────────┐               ┌──────────────────┐               ┌──────────────────┐
         │  Worker Node 1   │               │  Worker Node 2   │               │  Worker Node 3   │
         │  ┌────────────┐  │               │  ┌────────────┐  │               │  ┌────────────┐  │
         │  │   kubelet  │  │               │  │   kubelet  │  │               │  │   kubelet  │  │
         │  ├────────────┤  │               │  ├────────────┤  │               │  ├────────────┤  │
         │  │kube-proxy  │  │               │  │kube-proxy  │  │               │  │kube-proxy  │  │
         │  ├────────────┤  │               │  ├────────────┤  │               │  ├────────────┤  │
         │  │  Container │  │               │  │  Container │  │               │  │  Container │  │
         │  │   Runtime  │  │               │  │   Runtime  │  │               │  │   Runtime  │  │
         │  └────────────┘  │               │  └────────────┘  │               │  └────────────┘  │
         └──────────────────┘               └──────────────────┘               └──────────────────┘
                    │                                   │                                   │
                    └───────────────────────────────────┼───────────────────────────────────┘
                                                        │
                                                        ▼
                            ┌─────────────────────────────────────────────────────────────┐
                            │                    网络插件 (CNI)                            │
                            │              Calico / Cilium / Flannel                      │
                            └─────────────────────────────────────────────────────────────┘

1.2 推荐架构(生产环境多集群)

┌─────────────────────────────────────────────────────────────────────────────┐
│                            Ingress 层                                        │
│    ┌─────────────┐     ┌─────────────┐     ┌─────────────┐                   │
│    │  Nginx      │     │  Traefik    │     │  Kong       │                   │
│    │  Ingress    │     │  Ingress    │     │  Gateway    │                   │
│    └─────────────┘     └─────────────┘     └─────────────┘                   │
└─────────────────────────────────────────────────────────────────────────────┘
                                    │
        ┌───────────────────────────┼───────────────────────────┐
        │                           │                           │
        ▼                           ▼                           ▼
┌───────────────────┐     ┌───────────────────┐     ┌───────────────────┐
│  集群 cn-east-1   │     │  集群 cn-north-1  │     │  集群 disaster-    │
│  (生产主集群)      │     │  (生产备集群)      │     │  recovery (灾备)   │
│                   │     │                   │     │                   │
│  ┌─────────────┐  │     │  ┌─────────────┐  │     │  ┌─────────────┐  │
│  │ Master x3   │  │     │  │ Master x3   │  │     │  │ Master x3   │  │
│  └─────────────┘  │     │  └─────────────┘  │     │  └─────────────┘  │
│  ┌─────────────┐  │     │  ┌─────────────┐  │     │  ┌─────────────┐  │
│  │ Worker x10+ │  │     │  │ Worker x10+ │  │     │  │ Worker x5   │  │
│  └─────────────┘  │     │  └─────────────┘  │     │  └─────────────┘  │
│                   │     │                   │     │                   │
│  存储: Ceph/GlusterFS  │  存储: Ceph/GlusterFS  │  存储: 远程备份      │
└───────────────────┘     └───────────────────┘     └───────────────────┘
        │                           │                           │
        └───────────────────────────┼───────────────────────────┘
                                    │
                                    ▼
                        ┌─────────────────────┐
                        │   全局服务网格       │
                        │   Istio / Linkerd   │
                        └─────────────────────┘

1.3 K8s 集群部署配置 YAML

1.3.1 Namespace 配置

# namespaces/base-namespaces.yaml
# Base namespaces with Pod Security Standards (PSS) enforcement labels.
apiVersion: v1
kind: Namespace
metadata:
  name: production
  labels:
    name: production
    environment: prod
    # "restricted" is the strictest PSS level: non-root, no privilege
    # escalation, all capabilities dropped, seccomp profile required.
    pod-security.kubernetes.io/enforce: restricted
    pod-security.kubernetes.io/enforce-version: latest
---
apiVersion: v1
kind: Namespace
metadata:
  name: staging
  labels:
    name: staging
    environment: staging
    pod-security.kubernetes.io/enforce: baseline
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    name: monitoring
    # Monitoring agents (node exporters etc.) often need host-level access.
    pod-security.kubernetes.io/enforce: privileged
---
# NOTE(review): kube-system already exists in every cluster; applying this
# document only merges the labels onto the existing namespace.
apiVersion: v1
kind: Namespace
metadata:
  name: kube-system
  labels:
    name: kube-system
    pod-security.kubernetes.io/enforce: privileged

1.3.2 生产级 Deployment 配置

# deployments/production-app.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-server
  namespace: production
  labels:
    app: api-server
    version: v1.0.0
spec:
  replicas: 3
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: api-server
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1           # at most 1 extra Pod during a rolling update
      maxUnavailable: 0     # zero Pods may become unavailable during the update
  template:
    metadata:
      labels:
        app: api-server
        version: v1.0.0
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      # Service account
      serviceAccountName: api-server-sa

      # Pod-level security context
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault

      # Scheduling priority
      priorityClassName: high-priority

      # Anti-affinity: prefer spreading replicas across nodes
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - api-server
                topologyKey: kubernetes.io/hostname
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
                  - key: node.kubernetes.io/instance-type
                    operator: In
                    values:
                      - compute-optimized

      # Tolerations for dedicated production nodes
      tolerations:
        - key: "dedicated"
          operator: "Equal"
          value: "production"
          effect: "NoSchedule"

      # Topology spread across availability zones
      topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: topology.kubernetes.io/zone
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              app: api-server

      # Init container.
      # FIX(review): the original ran `chmod -R 755 /data && chown -R
      # 1000:1000 /data`, but the pod securityContext forces UID 1000
      # (runAsNonRoot), so chown would fail with EPERM — and the namespace's
      # "restricted" Pod Security level rejects containers that run as root
      # or omit the hardening fields below. Volume group ownership is already
      # handled by fsGroup: 1000 above, so the init container now only
      # verifies the data volume is writable.
      initContainers:
        - name: init-permissions
          image: busybox:1.36
          command: ['sh', '-c', 'test -w /data']
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: data-volume
              mountPath: /data

      containers:
        - name: api-server
          image: registry.example.com/api-server:v1.0.0
          imagePullPolicy: IfNotPresent

          # Ports
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
            - name: metrics
              containerPort: 8081
              protocol: TCP

          # Environment (downward API, ConfigMap and Secret references)
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            - name: APP_ENV
              valueFrom:
                configMapKeyRef:
                  name: app-config
                  key: environment
            - name: DB_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: db-credentials
                  key: password

          # Resource requests/limits (mandatory in production)
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
              ephemeral-storage: "1Gi"
            limits:
              cpu: "2000m"
              memory: "2Gi"
              ephemeral-storage: "10Gi"

          # Container-level security context (required by restricted PSS)
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL

          # Liveness probe
          livenessProbe:
            httpGet:
              path: /health/live
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
            successThreshold: 1

          # Readiness probe
          readinessProbe:
            httpGet:
              path: /health/ready
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
            successThreshold: 1

          # Startup probe (tolerates up to 30 * 5s = 150s of slow startup)
          startupProbe:
            httpGet:
              path: /health/startup
              port: http
            initialDelaySeconds: 0
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 30
            successThreshold: 1

          # Volume mounts (/tmp and /var/cache are writable emptyDirs
          # because the root filesystem is read-only)
          volumeMounts:
            - name: config-volume
              mountPath: /etc/app/config
              readOnly: true
            - name: data-volume
              mountPath: /data
            - name: tmp-volume
              mountPath: /tmp
            - name: cache-volume
              mountPath: /var/cache

      # Volumes
      volumes:
        - name: config-volume
          configMap:
            name: app-config
            defaultMode: 0444
        - name: data-volume
          persistentVolumeClaim:
            claimName: api-server-pvc
        - name: tmp-volume
          emptyDir:
            sizeLimit: 500Mi
        - name: cache-volume
          emptyDir:
            sizeLimit: 1Gi

      # Shutdown grace period
      terminationGracePeriodSeconds: 60

      # DNS tuning
      dnsPolicy: ClusterFirst
      dnsConfig:
        options:
          - name: ndots
            value: "2"
          - name: timeout
            value: "3"
          - name: attempts
            value: "3"

1.3.3 Service 配置

# services/api-server-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: api-server
  namespace: production
  labels:
    app: api-server
  annotations:
    # Cloud load-balancer settings (AWS NLB)
    service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
    service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
    # Prometheus scraping
    prometheus.io/scrape: "true"
    prometheus.io/port: "8081"
spec:
  type: LoadBalancer
  externalTrafficPolicy: Local  # preserve the client source IP
  sessionAffinity: None
  selector:
    app: api-server
  ports:
    - name: http
      port: 80
      targetPort: http
      protocol: TCP
    # NOTE(review): this exposes the metrics port through the external load
    # balancer as well — confirm that is intended, or scrape the internal
    # ClusterIP Service below instead.
    - name: metrics
      port: 8081
      targetPort: metrics
      protocol: TCP
---
# Internal Service (ClusterIP)
apiVersion: v1
kind: Service
metadata:
  name: api-server-internal
  namespace: production
  labels:
    app: api-server
spec:
  type: ClusterIP
  selector:
    app: api-server
  ports:
    - name: http
      port: 8080
      targetPort: http
      protocol: TCP
---
# Headless Service (for StatefulSets / per-pod DNS records)
apiVersion: v1
kind: Service
metadata:
  name: api-server-headless
  namespace: production
  labels:
    app: api-server
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    app: api-server
  ports:
    - name: http
      port: 8080
      targetPort: http
      protocol: TCP

1.3.4 Ingress 配置

# ingress/api-server-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: api-server-ingress
  namespace: production
  annotations:
    # Nginx ingress proxy tuning
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/proxy-body-size: "100m"
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "30"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
    nginx.ingress.kubernetes.io/proxy-buffering: "on"
    nginx.ingress.kubernetes.io/proxy-buffer-size: "128k"
    nginx.ingress.kubernetes.io/proxy-buffers-number: "4"
    # Rate limiting
    nginx.ingress.kubernetes.io/limit-connections: "100"
    nginx.ingress.kubernetes.io/limit-rps: "50"
    # Security headers.
    # NOTE(review): snippet annotations are disabled by default in recent
    # ingress-nginx releases (hardening after CVE-2021-25742); the controller
    # must explicitly allow them or this block is ignored — verify.
    nginx.ingress.kubernetes.io/configuration-snippet: |
      add_header X-Frame-Options "SAMEORIGIN" always;
      add_header X-Content-Type-Options "nosniff" always;
      add_header X-XSS-Protection "1; mode=block" always;
      add_header Referrer-Policy "strict-origin-when-cross-origin" always;
    # Certificates.
    # NOTE(review): modern cert-manager selects the ACME challenge type on
    # the (Cluster)Issuer; the acme-challenge-type annotation dates from old
    # releases — confirm your cert-manager version still honors it.
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    cert-manager.io/acme-challenge-type: "http01"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - api.example.com
      secretName: api-server-tls
  rules:
    - host: api.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: api-server-internal
                port:
                  number: 8080

1.3.5 ConfigMap 配置

# configmaps/app-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: app-config
  namespace: production
data:
  # Application settings
  environment: "production"
  log_level: "info"
  app_name: "api-server"

  # Database settings
  db_host: "postgres.production.svc.cluster.local"
  db_port: "5432"
  db_name: "production_db"
  db_max_connections: "100"
  db_connection_timeout: "30"

  # Redis settings
  redis_host: "redis.production.svc.cluster.local"
  redis_port: "6379"
  redis_db: "0"

  # Service discovery
  service_discovery_enabled: "true"

  # Example embedded config file (mounted into the Pod as application.yaml)
  application.yaml: |
    server:
      port: 8080
      shutdown: graceful
      compression:
        enabled: true
        mime-types: application/json,text/html,text/xml,text/plain

    spring:
      datasource:
        hikari:
          maximum-pool-size: 50
          minimum-idle: 10
          idle-timeout: 300000
          connection-timeout: 30000
          max-lifetime: 1200000

      redis:
        lettuce:
          pool:
            max-active: 20
            max-idle: 10
            min-idle: 5
            max-wait: 30000ms

    management:
      endpoints:
        web:
          exposure:
            include: health,info,metrics,prometheus
      metrics:
        tags:
          application: ${APP_NAME:api-server}
          environment: ${APP_ENV:production}

    logging:
      level:
        root: INFO
        com.example: DEBUG
      pattern:
        console: "%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n"
---
# Immutable ConfigMap (recommended for production: it cannot be edited in
# place — publish a new ConfigMap and roll the workload to change values).
apiVersion: v1
kind: ConfigMap
metadata:
  name: app-config-immutable
  namespace: production
immutable: true
data:
  feature_flags: |
    enable_cache: true
    enable_rate_limit: true
    enable_tracing: true

1.3.6 Secret 配置

# secrets/db-credentials.yaml
# WARNING(review): Secret manifests containing real credentials must not be
# committed to version control — use Sealed Secrets, SOPS or Vault.
apiVersion: v1
kind: Secret
metadata:
  name: db-credentials
  namespace: production
type: Opaque
stringData:
  username: "app_user"
  password: "change-me-in-production"  # use Sealed Secrets or Vault in production
  host: "postgres.production.svc.cluster.local"
  port: "5432"
  database: "production_db"
  connection_string: "postgresql://app_user:change-me-in-production@postgres.production.svc.cluster.local:5432/production_db"
---
# TLS Secret
apiVersion: v1
kind: Secret
metadata:
  name: api-server-tls
  namespace: production
type: kubernetes.io/tls
stringData:
  # In production let cert-manager create and renew this Secret automatically.
  tls.crt: |
    -----BEGIN CERTIFICATE-----
    MIICxjCCAa4CCQDv...
    -----END CERTIFICATE-----
  tls.key: |
    -----BEGIN PRIVATE KEY-----
    MIIEvgIBADANBgkq...
    -----END PRIVATE KEY-----

二、关键参数调优

2.1 kube-apiserver 配置详解

# /etc/kubernetes/manifests/kube-apiserver.yaml
apiVersion: v1
kind: Pod
metadata:
  name: kube-apiserver
  namespace: kube-system
spec:
  containers:
    - name: kube-apiserver
      # FIX(review): k8s.gcr.io is frozen; official images are published to
      # registry.k8s.io.
      image: registry.k8s.io/kube-apiserver:v1.28.0
      command:
        - kube-apiserver
        # ========== Basics ==========
        # FIX(review): --advertise-address must be this node's routable IP;
        # 0.0.0.0 is only meaningful for the *bind* address. Set per node.
        - --advertise-address=10.0.1.10
        - --bind-address=0.0.0.0
        - --secure-port=6443
        # NOTE(review): the insecure-port flags (--port / --insecure-port)
        # were removed from kube-apiserver; passing --port=0 makes a v1.28
        # binary refuse to start, so the flag was dropped.

        # ========== etcd ==========
        - --etcd-servers=https://etcd-01:2379,https://etcd-02:2379,https://etcd-03:2379
        - --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt
        - --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt
        - --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key

        # ========== Authentication ==========
        - --client-ca-file=/etc/kubernetes/pki/ca.crt
        - --tls-cert-file=/etc/kubernetes/pki/apiserver.crt
        - --tls-private-key-file=/etc/kubernetes/pki/apiserver.key
        - --service-account-issuer=https://kubernetes.default.svc.cluster.local
        - --service-account-key-file=/etc/kubernetes/pki/sa.pub
        - --service-account-signing-key-file=/etc/kubernetes/pki/sa.key
        - --service-account-lookup=true
        - --anonymous-auth=false

        # ========== Authorization / admission ==========
        - --authorization-mode=Node,RBAC
        # FIX(review): the PodSecurityPolicy admission plugin was removed in
        # v1.25 and would abort startup on v1.28; it is replaced by the
        # PodSecurity plugin, driven by the pod-security.kubernetes.io
        # namespace labels.
        - --enable-admission-plugins=NodeRestriction,PodSecurity,LimitRanger,ServiceAccount,DefaultStorageClass,ResourceQuota,Priority,MutatingAdmissionWebhook,ValidatingAdmissionWebhook

        # ========== Request limits ==========
        - --max-requests-inflight=800           # max concurrent non-mutating requests
        - --max-mutating-requests-inflight=400  # max concurrent mutating requests
        - --request-timeout=60s                  # default request timeout
        - --min-request-timeout=1800            # minimum watch-request timeout (seconds)

        # ========== Audit logging ==========
        - --audit-log-maxage=30
        - --audit-log-maxbackup=10
        - --audit-log-maxsize=100
        - --audit-log-path=/var/log/kubernetes/audit.log
        - --audit-policy-file=/etc/kubernetes/audit-policy.yaml

        # ========== Events ==========
        - --event-ttl=2h                         # event retention period

        # ========== Feature gates ==========
        # FIX(review): CSIStorageCapacity and EphemeralContainers graduated
        # to GA and their gates were removed before v1.28; specifying a
        # removed gate aborts startup. Only the still-valid gate is kept.
        - --feature-gates=RotateKubeletServerCertificate=true

        # ========== Security ==========
        - --enable-bootstrap-token-auth=true
        - --tls-min-version=VersionTLS12
        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384

      resources:
        requests:
          cpu: 250m
          memory: 512Mi
        limits:
          cpu: "4"
          memory: 4Gi

      volumeMounts:
        - name: kubernetes-config
          mountPath: /etc/kubernetes
          readOnly: true
        - name: var-log-kubernetes
          mountPath: /var/log/kubernetes

  volumes:
    - name: kubernetes-config
      hostPath:
        path: /etc/kubernetes
        type: Directory
    - name: var-log-kubernetes
      hostPath:
        path: /var/log/kubernetes
        type: DirectoryOrCreate

2.2 kube-controller-manager 配置详解

# /etc/kubernetes/manifests/kube-controller-manager.yaml
apiVersion: v1
kind: Pod
metadata:
  name: kube-controller-manager
  namespace: kube-system
spec:
  containers:
    - name: kube-controller-manager
      # FIX(review): k8s.gcr.io is frozen; use registry.k8s.io.
      image: registry.k8s.io/kube-controller-manager:v1.28.0
      command:
        - kube-controller-manager
        # ========== Basics ==========
        - --bind-address=0.0.0.0
        # NOTE(review): --port was removed from kube-controller-manager in
        # v1.24; passing --port=0 makes a v1.28 binary refuse to start, so
        # the flag was dropped.
        - --secure-port=10257

        # ========== Cluster ==========
        - --cluster-name=kubernetes
        - --cluster-cidr=10.244.0.0/16
        - --service-cluster-ip-range=10.96.0.0/12
        - --allocate-node-cidrs=true

        # ========== Authentication ==========
        - --authentication-kubeconfig=/etc/kubernetes/controller-manager.conf
        - --authorization-kubeconfig=/etc/kubernetes/controller-manager.conf
        - --kubeconfig=/etc/kubernetes/controller-manager.conf
        - --use-service-account-credentials=true
        - --service-account-private-key-file=/etc/kubernetes/pki/sa.key
        - --root-ca-file=/etc/kubernetes/pki/ca.crt

        # ========== Node CIDRs / certificate signing ==========
        - --node-cidr-mask-size=24
        - --cluster-signing-cert-file=/etc/kubernetes/pki/ca.crt
        - --cluster-signing-key-file=/etc/kubernetes/pki/ca.key
        - --cluster-signing-duration=8760h0m0s  # issued certs valid for 1 year

        # ========== Controllers / leader election ==========
        - --controllers=*,bootstrapsigner,tokencleaner
        - --leader-elect=true
        - --leader-elect-lease-duration=15s
        - --leader-elect-renew-deadline=10s
        - --leader-elect-resource-lock=leases
        - --leader-elect-retry-period=2s

        # ========== Controller concurrency ==========
        - --concurrent-deployment-syncs=5
        - --concurrent-replicaset-syncs=5
        - --concurrent-statefulset-syncs=5
        - --concurrent-daemonset-syncs=2
        - --concurrent-job-syncs=5
        - --concurrent-gc-syncs=20

        # ========== Node monitoring / eviction ==========
        # NOTE(review): --pod-eviction-timeout was dropped — it has had no
        # effect since taint-based eviction became the default; eviction
        # delay is now controlled by the not-ready/unreachable toleration
        # seconds on Pods.
        - --node-monitor-grace-period=40s
        - --node-monitor-period=5s

        # ========== Feature gates ==========
        - --feature-gates=RotateKubeletServerCertificate=true

      resources:
        requests:
          cpu: 200m
          memory: 256Mi
        limits:
          cpu: "2"
          memory: 2Gi

2.3 kube-scheduler 配置详解

# /etc/kubernetes/manifests/kube-scheduler.yaml
apiVersion: v1
kind: Pod
metadata:
  name: kube-scheduler
  namespace: kube-system
spec:
  containers:
    - name: kube-scheduler
      # FIX(review): k8s.gcr.io is frozen; use registry.k8s.io.
      image: registry.k8s.io/kube-scheduler:v1.28.0
      command:
        - kube-scheduler
        # ========== Basics ==========
        - --bind-address=0.0.0.0
        # NOTE(review): --port was removed from kube-scheduler (v1.23);
        # passing --port=0 makes a v1.28 binary refuse to start, so the flag
        # was dropped.
        - --secure-port=10259

        # ========== Authentication ==========
        - --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
        - --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
        - --kubeconfig=/etc/kubernetes/scheduler.conf

        # ========== Leader election ==========
        - --leader-elect=true
        - --leader-elect-lease-duration=15s
        - --leader-elect-renew-deadline=10s
        - --leader-elect-resource-lock=leases
        - --leader-elect-retry-period=2s

        # ========== Scheduling ==========
        - --profiling=false
        # FIX(review): --algorithm-provider and --policy-config-file were
        # removed in v1.23 (and conflicted with --config anyway); scheduling
        # profiles and plugins are configured in the KubeSchedulerConfiguration
        # file referenced below.
        - --config=/etc/kubernetes/scheduler-config.yaml

      resources:
        requests:
          cpu: 100m
          memory: 128Mi
        limits:
          cpu: "1"
          memory: 1Gi

2.4 kubelet 配置详解

# /var/lib/kubelet/config.yaml
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration

# ========== Basics ==========
address: 0.0.0.0
port: 10250
readOnlyPort: 0
authentication:
  anonymous:
    enabled: false
  webhook:
    enabled: true
  x509:
    clientCAFile: /etc/kubernetes/pki/ca.crt
authorization:
  mode: Webhook

# ========== Certificate rotation ==========
serverTLSBootstrap: true
rotateCertificates: true
rotateServerCertificates: true

# ========== Resource reservation ==========
systemReserved:
  cpu: "500m"
  memory: "1Gi"
  ephemeral-storage: "10Gi"
kubeReserved:
  cpu: "500m"
  memory: "1Gi"
  ephemeral-storage: "10Gi"
systemReservedCgroup: /system.slice
kubeReservedCgroup: /kubelet.slice
enforceNodeAllocatable:
  - pods
  - system-reserved
  - kube-reserved

# ========== Eviction ==========
evictionHard:
  memory.available: "500Mi"
  nodefs.available: "10%"
  nodefs.inodesFree: "5%"
  imagefs.available: "15%"
evictionSoft:
  memory.available: "1Gi"
  nodefs.available: "15%"
  imagefs.available: "20%"
evictionSoftGracePeriod:
  memory.available: "1m30s"
  nodefs.available: "2m"
  imagefs.available: "2m"
evictionMaxPodGracePeriod: 60
evictionMinimumReclaim:
  memory.available: "0Mi"
  nodefs.available: "500Mi"
  imagefs.available: "2Gi"
evictionPressureTransitionPeriod: 5m

# ========== Image GC ==========
imageMinimumGCAge: 2m
imageGCHighThresholdPercent: 85
imageGCLowThresholdPercent: 80
# FIX(review): removed maximumDeadContainersPerContainer and
# minimumContainerTTLDuration — they are (deprecated) kubelet command-line
# flags, not KubeletConfiguration fields; strict config decoding rejects
# unknown fields and the kubelet would fail to start.

# ========== Pods ==========
maxPods: 110
podPidsLimit: 4096
podsPerCore: 0  # 0 means no per-core limit
enableControllerAttachDetach: true

# ========== Networking ==========
clusterDomain: cluster.local
clusterDNS:
  - 10.96.0.10
# FIX(review): the original declared hairpinMode twice (hairpin-veth, then
# promiscuous-bridge). Duplicate YAML keys are invalid; since the last value
# silently won, the effective setting — promiscuous-bridge — is kept.
hairpinMode: promiscuous-bridge

# ========== Cgroups ==========
cgroupDriver: systemd
cgroupsPerQOS: true
cgroupRoot: /

# ========== Feature gates ==========
featureGates:
  RotateKubeletServerCertificate: true
  KubeletInUserNamespace: false
  MemoryQoS: true
  CPUManagerPolicyOptions: true
  CPUManagerPolicyAlphaOptions: true

# ========== CPU manager ==========
# NOTE(review): the static policy requires an integer number of reserved
# CPUs (system + kube reserved = 1000m here, which satisfies it) and a reset
# of /var/lib/kubelet/cpu_manager_state when switching policies — verify on
# existing nodes.
cpuManagerPolicy: static
cpuManagerReconcilePeriod: 10s
topologyManagerPolicy: best-effort

# ========== Logging ==========
logging:
  format: json
  flushFrequency: 5s
  verbosity: 2

# ========== Certificate directory ==========
certificatesDir: /var/lib/kubelet/pki

# ========== Container runtime ==========
containerRuntimeEndpoint: unix:///run/containerd/containerd.sock

2.5 etcd 配置详解

# /etc/etcd/etcd.conf
# ========== Member ==========
ETCD_NAME=etcd-node-01
ETCD_DATA_DIR=/var/lib/etcd
ETCD_WAL_DIR=/var/lib/etcd/wal
ETCD_LISTEN_PEER_URLS=https://10.0.1.10:2380
# FIX(review): removed the plaintext http://127.0.0.1:2379 listener. With
# client-cert auth enabled below, a local HTTP listener accepts
# unauthenticated, unencrypted connections and bypasses TLS entirely.
ETCD_LISTEN_CLIENT_URLS=https://10.0.1.10:2379

# ========== Cluster ==========
ETCD_INITIAL_ADVERTISE_PEER_URLS=https://10.0.1.10:2380
ETCD_ADVERTISE_CLIENT_URLS=https://10.0.1.10:2379
ETCD_INITIAL_CLUSTER=etcd-node-01=https://10.0.1.10:2380,etcd-node-02=https://10.0.1.11:2380,etcd-node-03=https://10.0.1.12:2380
ETCD_INITIAL_CLUSTER_STATE=new
ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster-token

# ========== Security ==========
ETCD_CLIENT_CERT_AUTH=true
ETCD_TRUSTED_CA_FILE=/etc/etcd/pki/ca.crt
ETCD_CERT_FILE=/etc/etcd/pki/server.crt
ETCD_KEY_FILE=/etc/etcd/pki/server.key
ETCD_PEER_CLIENT_CERT_AUTH=true
ETCD_PEER_TRUSTED_CA_FILE=/etc/etcd/pki/ca.crt
ETCD_PEER_CERT_FILE=/etc/etcd/pki/peer.crt
ETCD_PEER_KEY_FILE=/etc/etcd/pki/peer.key

# ========== Performance tuning ==========
# Heartbeat interval (default 100 ms; keep the default when network RTT < 10 ms)
ETCD_HEARTBEAT_INTERVAL=100

# Election timeout (default 1000 ms; roughly 10x the heartbeat interval)
ETCD_ELECTION_TIMEOUT=1000

# Snapshot every N transactions (default 100000; a larger value means fewer
# snapshots). FIX(review): the original set 10000, which contradicted its
# own comment by snapshotting 10x more often than the default.
ETCD_SNAPSHOT_COUNT=100000

# Auto-compaction mode (periodic = time based, revision = revision based)
ETCD_AUTO_COMPACTION_MODE=periodic

# Auto-compaction retention window
ETCD_AUTO_COMPACTION_RETENTION=1h

# Backend quota (default 2 GB; 8 GB is the recommended maximum)
ETCD_QUOTA_BACKEND_BYTES=8589934592

# Max request size in bytes (default ~1.5 MB; raised to 15 MB here)
ETCD_MAX_REQUEST_BYTES=15728640

# ========== Logging ==========
ETCD_LOG_LEVEL=info
# NOTE(review): ETCD_LOG_PACKAGE_LEVELS belongs to the old capnslog backend;
# etcd >= 3.4 uses zap and ignores it — confirm against your etcd version.
ETCD_LOG_PACKAGE_LEVELS=etcdserver=info,raft=info

# ========== Misc ==========
ETCD_ENABLE_V2=false

三、系统内核/OS 层优化

3.1 系统内核参数调优

# /etc/sysctl.d/99-kubernetes.conf
# ========== Networking ==========

# Connection-tracking table size (must be raised on high-concurrency nodes)
net.netfilter.nf_conntrack_max=1048576
net.nf_conntrack_max=1048576

# TCP tuning (tcp_syncookies doubles as SYN-flood protection; the original
# file declared it a second time in the security section — removed)
net.ipv4.tcp_fin_timeout=30
net.ipv4.tcp_keepalive_time=600
net.ipv4.tcp_keepalive_probes=3
net.ipv4.tcp_keepalive_intvl=15
net.ipv4.tcp_max_syn_backlog=16384
net.ipv4.tcp_max_tw_buckets=65536
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_syncookies=1
net.ipv4.tcp_syn_retries=2
net.ipv4.tcp_synack_retries=2
net.ipv4.tcp_max_orphans=32768
net.ipv4.tcp_retries2=5
net.ipv4.tcp_sack=1
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_rmem=4096 87380 16777216
net.ipv4.tcp_wmem=4096 65536 16777216
net.core.somaxconn=32768
net.core.netdev_max_backlog=16384
net.core.rmem_max=16777216
net.core.wmem_max=16777216
net.core.rmem_default=262144
net.core.wmem_default=262144
net.core.optmem_max=16777216

# Local ephemeral port range
net.ipv4.ip_local_port_range=1024 65535

# Forwarding (required by Kubernetes networking)
net.ipv4.ip_forward=1
net.ipv6.conf.all.forwarding=1

# ========== Memory ==========

# Allow memory overcommit
vm.overcommit_memory=1

# Max memory-map areas (needed by Elasticsearch, MongoDB, ...)
vm.max_map_count=262144

# Avoid swapping (kubelet assumes swap is off)
vm.swappiness=0

# Dirty-page writeback tuning
vm.dirty_ratio=20
vm.dirty_background_ratio=5
vm.dirty_expire_centisecs=3000
vm.dirty_writeback_centisecs=500

# Minimum free memory — FIX(review): the value is in KiB, not bytes as the
# original comment claimed
vm.min_free_kbytes=65536

# ========== Filesystem ==========

# System-wide file-descriptor limit
fs.file-max=2097152

# inotify limits (kubelet and log collectors watch many files)
fs.inotify.max_user_instances=8192
fs.inotify.max_user_watches=524288
fs.inotify.max_queued_events=32768

# Async IO
fs.aio-max-nr=1048576

# ========== Security ==========

# Disable IPv6.
# NOTE(review): this contradicts net.ipv6.conf.all.forwarding=1 above —
# pick one. Only disable IPv6 on strictly IPv4-only clusters; some
# components expect the IPv6 loopback (::1) to exist.
net.ipv6.conf.all.disable_ipv6=1
net.ipv6.conf.default.disable_ipv6=1
net.ipv6.conf.lo.disable_ipv6=1

# Ignore ICMP redirects
net.ipv4.conf.all.accept_redirects=0
net.ipv4.conf.default.accept_redirects=0
net.ipv4.conf.all.send_redirects=0
net.ipv4.conf.default.send_redirects=0

# Disable source routing
net.ipv4.conf.all.accept_source_route=0
net.ipv4.conf.default.accept_source_route=0

# Reverse-path filtering
net.ipv4.conf.all.rp_filter=1
net.ipv4.conf.default.rp_filter=1

# ========== Misc ==========

# Max PIDs
kernel.pid_max=4194303

# Max threads
kernel.threads-max=2097152

# NOTE(review): kernel.random.poolsize is read-only on modern kernels; the
# original `kernel.random.poolsize=4096` entry was removed because setting
# it fails when the file is applied.

# Message queues / shared memory
kernel.msgmax=65536
kernel.msgmnb=65536
kernel.msgmni=4096
kernel.shmmax=68719476736
kernel.shmall=4294967296

3.2 系统资源限制配置

# /etc/security/limits.d/99-kubernetes.conf
# ========== Kubernetes 服务账户限制 ==========

# 文件描述符
kubernetes    soft    nofile    1048576
kubernetes    hard    nofile    1048576

# 进程数
kubernetes    soft    nproc     unlimited
kubernetes    hard    nproc     unlimited

# 内存锁定(etcd、ES 等需要)
kubernetes    soft    memlock   unlimited
kubernetes    hard    memlock   unlimited

# 核心转储大小
kubernetes    soft    core      unlimited
kubernetes    hard    core      unlimited

# ========== Docker/Container Runtime 限制 ==========

# 文件描述符
docker        soft    nofile    1048576
docker        hard    nofile    1048576

# 进程数
docker        soft    nproc     unlimited
docker        hard    nproc     unlimited

# ========== 系统级限制(/etc/security/limits.conf) ==========

# Root 用户
root          soft    nofile    1048576
root          hard    nofile    1048576
root          soft    nproc     unlimited
root          hard    nproc     unlimited

# 所有用户
*             soft    nofile    1048576
*             hard    nofile    1048576
*             soft    nproc     65535
*             hard    nproc     65535
*             soft    stack     8192
*             hard    stack     8192

3.3 Systemd 服务优化

# /etc/systemd/system/kubelet.service.d/10-kubelet.conf
[Service]
# 资源限制
LimitNOFILE=1048576
LimitNPROC=unlimited
LimitCORE=infinity
LimitMEMLOCK=infinity

# CPU 权重
CPUWeight=90

# IO 权重
IOWeight=90

# 内存限制
MemoryMax=infinity

# 重启策略
Restart=always
RestartSec=10s

# 启动超时
TimeoutStartSec=300
TimeoutStopSec=300

# 环境变量
Environment="KUBELET_EXTRA_ARGS=--v=2"

# OOM 评分调整(-1000 到 1000,越小越不容易被杀)
OOMScoreAdjust=-999

3.4 Containerd 配置优化

# /etc/containerd/config.toml
version = 2

[plugins."io.containerd.grpc.v1.cri"]
  # ========== 沙箱配置 ==========
  sandbox_image = "registry.k8s.io/pause:3.9"

  # ========== 容器统计信息 ==========
  disable_tcp_service_dns = false
  stream_server_address = "127.0.0.1"
  stream_server_port = "0"
  enable_selinux = false
  selinux_category_range = 1024
  max_container_log_line_size = 16384
  disable_cgroup = false
  disable_apparmor = false
  restrict_oom_score_adj = false
  max_concurrent_downloads = 10
  disable_proc_mount = false
  containerd_image_unpacked_pause_sync = false
  ignore_image_defined_volumes = false

  [plugins."io.containerd.grpc.v1.cri".containerd]
    snapshotter = "overlayfs"
    default_runtime_name = "runc"
    disable_snapshot_annotations = false
    discard_unpacked_layers = false

    [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
      runtime_type = "io.containerd.runc.v2"
      runtime_engine = ""
      runtime_root = ""
      privileged_without_host_devices = false
      base_runtime_spec = ""

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
        SystemdCgroup = true
        BinaryName = "runc"

  # ========== CNI 配置 ==========
  [plugins."io.containerd.grpc.v1.cri".cni]
    bin_dir = "/opt/cni/bin"
    conf_dir = "/etc/cni/net.d"
    max_conf_num = 1
    conf_template = ""

  # ========== 镜像仓库配置 ==========
  [plugins."io.containerd.grpc.v1.cri".registry]
    [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
      [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
        endpoint = ["https://registry.docker-cn.com", "https://mirror.gcr.io"]

      [plugins."io.containerd.grpc.v1.cri".registry.mirrors."k8s.gcr.io"]
        endpoint = ["https://registry.aliyuncs.com/k8sxio"]

      [plugins."io.containerd.grpc.v1.cri".registry.mirrors."registry.k8s.io"]
        endpoint = ["https://registry.aliyuncs.com/k8sxio"]

  # ========== Pod 统计信息 ==========
  [plugins."io.containerd.grpc.v1.cri".stats]
    collect_period = 10

  # ========== 流程超时 ==========
  [plugins."io.containerd.grpc.v1.cri".streaming]
    streaming_server_address = "127.0.0.1"
    streaming_server_port = "0"
    stream_idle_timeout = "4h0m0s"

[plugins."io.containerd.grpc.v1.cri".image_decryption]
  key_model = "node"

# ========== 性能优化 ==========
[plugins."io.containerd.internal.v1.opt"]
  path = "/opt/containerd"

[plugins."io.containerd.internal.v1.tracing"]
  sampling_ratio = 0.0
  service_name = "containerd"

# ========== 调试配置 ==========
[debug]
  level = "info"
  format = "json"
  address = ""
  uid = 0
  gid = 0

四、监控与告警

4.1 Prometheus 监控配置

```yaml
# prometheus/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    environment: 'prod'
    datacenter: 'cn-east-1'

# ========== 告警管理器配置 ==========
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
      scheme: http
      timeout: 10s
      api_version: v2

# ========== 规则文件 ==========
rule_files:
  - "/etc/prometheus/rules/kubernetes-*.yml"
  - "/etc/prometheus/rules/node-*.yml"
  - "/etc/prometheus/rules/custom-*.yml"

# ========== 抓取配置 ==========
scrape_configs:
  # Kubernetes API Server
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

  # Kubernetes 节点
  - job_name: 'kubernetes-nodes'
    kubernetes_sd_configs:
      - role: node
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics

  # Kubernetes Pods
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # 注:原文在此处被截断,以下按 Prometheus 官方示例补全地址重写规则
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
```