SRE 话题文档:Docker 容器运维

本文档面向生产环境,涵盖 Docker 容器管理、镜像构建、网络存储、安全加固等核心运维场景。


1. 生产环境部署架构

1.1 架构图(ASCII)

┌─────────────────────────────────────────────────────────────────────────────┐
│                        Docker 生产环境架构                                   │
└─────────────────────────────────────────────────────────────────────────────┘

                              ┌─────────────────┐
                              │   CI/CD Pipeline│
                              │  (镜像构建)      │
                              └────────┬────────┘
                                       │
                    ┌──────────────────┼──────────────────┐
                    │                  │                  │
                    ▼                  ▼                  ▼
           ┌───────────────┐  ┌───────────────┐  ┌───────────────┐
           │  Registry     │  │  Registry     │  │  Registry     │
           │  (Harbor)     │  │  (Docker Hub) │  │  (阿里云 ACR)  │
           └───────┬───────┘  └───────┬───────┘  └───────┬───────┘
                   │                  │                  │
                   └──────────────────┼──────────────────┘
                                      │
                    ┌─────────────────┴─────────────────┐
                    │                                   │
                    ▼                                   ▼
           ┌───────────────┐                   ┌───────────────┐
           │  Docker Host  │                   │  Docker Host  │
           │  (Node-01)    │                   │  (Node-02)    │
           │               │                   │               │
           │ ┌───────────┐ │                   │ ┌───────────┐ │
           │ │ Container │ │                   │ │ Container │ │
           │ ├───────────┤ │                   │ ├───────────┤ │
           │ │ Container │ │                   │ │ Container │ │
           │ ├───────────┤ │                   │ ├───────────┤ │
           │ │ Container │ │                   │ │ Container │ │
           │ └───────────┘ │                   │ └───────────┘ │
           │               │                   │               │
           │ ┌───────────┐ │                   │ ┌───────────┐ │
           │ │  Volumes  │ │                   │ │  Volumes  │ │
           │ └───────────┘ │                   │ └───────────┘ │
           └───────────────┘                   └───────────────┘
                    │                                   │
                    └─────────────────┬─────────────────┘
                                      │
                              ┌───────┴───────┐
                              │   Storage     │
                              │ (NFS/Ceph/S3) │
                              └───────────────┘

┌─────────────────────────────────────────────────────────────────────────────┐
│  监控 & 管理组件                                                             │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐         │
│  │ Prometheus  │  │   Grafana   │  │ cAdvisor    │  │ Portainer   │        │
│  │  (监控)      │  │  (可视化)    │  │ (容器指标)   │  │  (管理界面)  │        │
│  └─────────────┘  └─────────────┘  └─────────────┘  └─────────────┘         │
└─────────────────────────────────────────────────────────────────────────────┘

1.2 Docker Compose 部署(监控栈)

# docker-compose.yml - Docker 监控栈
version: '3.8'

services:
  # Prometheus
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
    networks:
      - monitoring
    restart: unless-stopped

  # Grafana - dashboards on top of Prometheus
  grafana:
    image: grafana/grafana:10.2.0
    container_name: grafana
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # The admin password is read from the shell environment or the .env file
      # next to this compose file; compose aborts with the message below when
      # it is unset, so a default credential is never deployed by accident.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD in the environment or .env}
      - GF_INSTALL_PLUGINS=grafana-clock-panel
    ports:
      - "3000:3000"
    volumes:
      # Named volume keeps dashboards, users and plugins across re-creation.
      - grafana-data:/var/lib/grafana
    networks:
      - monitoring
    restart: unless-stopped

  # cAdvisor - per-container metrics collector
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    # privileged + /dev/kmsg follow the upstream cAdvisor run command
    # (/dev/kmsg is what cAdvisor reads to detect OOM kills).
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      # Replaces the legacy /cgroup mount: that path does not exist on current
      # hosts (cgroups live under /sys/fs/cgroup, already covered by /sys), and
      # bind-mounting it made Docker create an empty /cgroup directory on the
      # host. /dev/disk is the mount the upstream run command uses.
      - /dev/disk/:/dev/disk:ro
    ports:
      - "8080:8080"
    networks:
      - monitoring
    restart: unless-stopped

  # Node Exporter
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring
    restart: unless-stopped

  # Portainer - Docker management UI
  portainer:
    # Pinned release instead of :latest so that re-deploys are reproducible and
    # upgrades are deliberate. Bump the tag when upgrading.
    image: portainer/portainer-ce:2.19.4
    container_name: portainer
    command: -H unix:///var/run/docker.sock
    volumes:
      # Access to the Docker socket is equivalent to root on the host:
      # restrict who can reach the UI published on 9443.
      - /var/run/docker.sock:/var/run/docker.sock
      - portainer-data:/data
    ports:
      - "9443:9443"
    networks:
      - monitoring
    restart: unless-stopped

  # Registry - 本地镜像仓库
  registry:
    image: registry:2
    container_name: registry
    environment:
      - REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY=/var/lib/registry
      - REGISTRY_HTTP_ADDR=0.0.0.0:5000
    volumes:
      - registry-data:/var/lib/registry
    ports:
      - "5000:5000"
    networks:
      - monitoring
    restart: unless-stopped

volumes:
  prometheus-data:
  grafana-data:
  portainer-data:
  registry-data:

networks:
  monitoring:
    driver: bridge

2. 核心配置与优化

2.1 Docker Daemon 配置

# /etc/docker/daemon.json
#
# The file itself must be strict JSON: dockerd refuses to start if it contains
# comments or keys it does not know. These notes therefore live outside the
# braces and must NOT be copied into the file.
#
#   log-driver / log-opts  json-file logs, rotated at 100 MB x 5 files per
#                          container; "labels" and "tag" add metadata to entries.
#   storage-driver         overlay2. "overlay2.size" sets a per-container
#                          writable-layer quota and only works when
#                          /var/lib/docker is on XFS mounted with pquota;
#                          drop the option on any other filesystem.
#   registry-mirrors       pull-through mirrors tried before Docker Hub.
#   insecure-registries    registries reached over plain HTTP / untrusted TLS.
#   bip / default-ulimits  docker0 bridge address and default nofile limit.
#   live-restore           containers keep running while dockerd restarts.
#   userland-proxy         false = published ports use iptables only.
#   no-new-privileges      containers cannot gain privileges via setuid/setgid.
#   metrics-addr           Prometheus endpoint; 0.0.0.0 exposes it on every
#                          interface, so firewall port 9323.
#
# Removed from the previous version of this example:
#   - "gc", "image-gc-high-threshold", "image-gc-low-threshold": these are not
#     dockerd options (image GC thresholds belong to the kubelet); unknown keys
#     make dockerd fail at startup.
#   - "overlay2.override_kernel_check": removed from Docker Engine.
#   - "runtimes": {"runc": ...}: runc is the built-in runtime and does not need
#     to be (re)declared.
#   - "experimental": not needed for metrics-addr on current releases.
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "5",
    "labels": "app,environment",
    "tag": "{{.Name}}/{{.ID}}"
  },

  "storage-driver": "overlay2",
  "storage-opts": [
    "overlay2.size=100G"
  ],

  "registry-mirrors": [
    "https://registry.docker-cn.com",
    "https://mirror.ccs.tencentyun.com"
  ],
  "insecure-registries": [
    "harbor.example.com:5000",
    "localhost:5000"
  ],

  "bip": "172.17.0.1/16",
  "default-ulimits": {
    "nofile": {
      "Name": "nofile",
      "Hard": 65535,
      "Soft": 65535
    }
  },

  "live-restore": true,
  "userland-proxy": false,
  "no-new-privileges": true,

  "default-runtime": "runc",

  "metrics-addr": "0.0.0.0:9323"
}

2.2 Dockerfile 最佳实践

# ==============================================================================
# Dockerfile 最佳实践示例
# ==============================================================================

# 1. 使用多阶段构建
# ==================

# Build stage: compiles a static binary with the full Go toolchain.
FROM golang:1.21-alpine AS builder

WORKDIR /app

# Copy the module files first so the dependency layer stays cached until
# go.mod / go.sum change.
COPY go.mod go.sum ./
RUN go mod download

# Copy the sources and build. CGO is disabled so the binary has no libc
# dependency; -w -s strips debug information to shrink it.
COPY . .
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-w -s" -o main .

# Runtime stage: minimal Alpine image containing only what the binary needs.
FROM alpine:3.19

# One layer for runtime packages, the unprivileged user and the application
# directory (owned by that user, as the previous `chown -R /app` left it).
RUN apk --no-cache add ca-certificates tzdata && \
    addgroup -g 1000 appgroup && \
    adduser -u 1000 -G appgroup -D appuser && \
    mkdir /app && \
    chown appuser:appgroup /app

WORKDIR /app

# Set ownership while copying. A separate `RUN chown -R` would rewrite every
# file into an additional layer and store the binary twice in the image.
COPY --from=builder --chown=appuser:appgroup /app/main .
COPY --from=builder --chown=appuser:appgroup /app/config ./config

# Drop root for everything that follows, including the running process.
USER appuser

# Documents the listening port (does not publish it).
EXPOSE 8080

# Liveness probe using the busybox wget shipped with Alpine.
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD wget --no-verbose --tries=1 --spider http://localhost:8080/health || exit 1

# ENTRYPOINT is the fixed executable; CMD supplies default arguments that can
# be overridden at `docker run`.
ENTRYPOINT ["./main"]
CMD ["--config", "config/app.yaml"]


# ==============================================================================
# Node.js 应用 Dockerfile
# ==============================================================================

# Build stage
FROM node:20-alpine AS builder

WORKDIR /app

# Copy the manifests first so the dependency layer is cached until they change.
COPY package*.json ./

# Full install: `npm run build` needs the devDependencies (compiler, bundler),
# so a production-only install here would break the build step.
RUN npm ci

# Copy the sources and build.
COPY . .
RUN npm run build

# Remove devDependencies so that only runtime packages are copied below.
RUN npm prune --omit=dev

# Runtime stage
FROM node:20-alpine

WORKDIR /app

# dumb-init runs as PID 1 to forward signals and reap zombie processes.
RUN apk add --no-cache dumb-init

# The official node image already ships an unprivileged `node` user with
# uid/gid 1000. Creating another account with id 1000 fails because the id is
# taken, so the built-in user is reused.
COPY --from=builder --chown=node:node /app/node_modules ./node_modules
COPY --from=builder --chown=node:node /app/dist ./dist
COPY --from=builder --chown=node:node /app/package.json ./
# The HEALTHCHECK below executes this script, so it has to exist in the final
# image. NOTE(review): assumes healthcheck.js sits in the project root; adjust
# the source path if it lives elsewhere.
COPY --from=builder --chown=node:node /app/healthcheck.js ./

USER node

EXPOSE 3000

HEALTHCHECK --interval=30s --timeout=3s \
    CMD node healthcheck.js || exit 1

ENTRYPOINT ["dumb-init", "--"]
CMD ["node", "dist/main.js"]


# ==============================================================================
# Python 应用 Dockerfile
# ==============================================================================

# Build stage: has the compiler toolchain needed to build wheels.
FROM python:3.11-slim AS builder

WORKDIR /app

# System packages required to compile native extensions; the apt lists are
# removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install into a virtualenv so the whole dependency tree can be copied to the
# runtime stage as one directory.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install Python dependencies (cached until requirements.txt changes).
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Runtime stage: no compiler, only the interpreter and the virtualenv.
FROM python:3.11-slim

WORKDIR /app

# Apply pending security updates of the base image.
RUN apt-get update && apt-get upgrade -y && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Unprivileged user that runs the application.
RUN groupadd -g 1000 appgroup && \
    useradd -u 1000 -g appgroup -m appuser

# Bring over the virtualenv built above and put it first on PATH.
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy the application. Use a .dockerignore to keep .git, caches and local
# secrets out of the build context.
COPY --chown=appuser:appgroup . .

USER appuser

EXPOSE 8000

# Standard-library probe: does not depend on `requests` being installed,
# cannot hang past the timeout, and fails on HTTP error status codes because
# urlopen raises for them.
HEALTHCHECK --interval=30s --timeout=3s \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=2)" || exit 1

# NOTE(review): assumes gunicorn is listed in requirements.txt.
CMD ["gunicorn", "--bind", "0.0.0.0:8000", "--workers", "4", "app:app"]

2.3 Docker Compose 完整示例

# docker-compose.yml - 生产级 Web 应用
version: '3.8'

services:
  # Application service
  app:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        - BUILD_ENV=production
    image: myapp:${VERSION:-latest}
    container_name: myapp
    restart: unless-stopped
    ports:
      - "${APP_PORT:-8080}:8080"
    environment:
      - NODE_ENV=production
      # The password is taken from the shell environment or the .env file, not
      # committed here; compose aborts with the message below when it is unset.
      # It is the same variable the postgres service uses, so the two cannot
      # drift apart. URL-encode it if it contains reserved characters.
      - DATABASE_URL=postgres://user:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}@postgres:5432/mydb
      - REDIS_URL=redis://redis:6379
      - LOG_LEVEL=info
    env_file:
      - .env.production
    volumes:
      # Writable locations must be volumes because the root filesystem is
      # read-only (see read_only below).
      - app-logs:/app/logs
      - app-uploads:/app/uploads
    networks:
      - frontend
      - backend
    # Start only after the datastores report healthy, not merely started.
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 2G
        reservations:
          cpus: '0.5'
          memory: 512M
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 5
        window: 120s
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "5"
    labels:
      - "app.name=myapp"
      - "app.environment=production"
    security_opt:
      - no-new-privileges:true
    # Immutable root filesystem; /tmp is the only scratch space besides the
    # volumes above.
    read_only: true
    tmpfs:
      - /tmp

  # PostgreSQL database
  postgres:
    image: postgres:15-alpine
    container_name: postgres
    restart: unless-stopped
    environment:
      - POSTGRES_USER=user
      # Same variable as in the app's DATABASE_URL; never hard-code it here.
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}
      - POSTGRES_DB=mydb
      # Data lives in a subdirectory of the mounted volume.
      - PGDATA=/var/lib/postgresql/data/pgdata
    volumes:
      - postgres-data:/var/lib/postgresql/data
    # Only on the internal backend network; no ports are published.
    networks:
      - backend
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U user -d mydb"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          memory: 1G
        reservations:
          memory: 256M

  # Redis 缓存
  redis:
    image: redis:7-alpine
    container_name: redis
    restart: unless-stopped
    command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
    volumes:
      - redis-data:/data
    networks:
      - backend
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Nginx 反向代理
  nginx:
    image: nginx:alpine
    container_name: nginx
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
      - ./ssl:/etc/nginx/ssl:ro
      - nginx-logs:/var/log/nginx
    networks:
      - frontend
    depends_on:
      - app
    healthcheck:
      test: ["CMD", "nginx", "-t"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  app-logs:
  app-uploads:
  postgres-data:
  redis-data:
  nginx-logs:

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    internal: true

3. 网络与存储

3.1 网络配置

# ==============================================================================
# Docker 网络类型
# ==============================================================================

# 1. Bridge 网络(默认)
docker network create --driver bridge my-bridge

# 2. Host 网络(直接使用主机网络)
docker run --network host nginx

# 3. None 网络(无网络)
docker run --network none alpine

# 4. Overlay 网络(Swarm 集群)
docker network create --driver overlay --subnet=10.0.0.0/24 my-overlay

# 5. Macvlan 网络(直接连接物理网络)
docker network create -d macvlan \
    --subnet=192.168.1.0/24 \
    --gateway=192.168.1.1 \
    -o parent=eth0 my-macvlan

# ==============================================================================
# 网络命令
# ==============================================================================

# 列出网络
docker network ls

# 查看网络详情
docker network inspect my-bridge

# 连接容器到网络
docker network connect my-bridge my-container

# 断开连接
docker network disconnect my-bridge my-container

# 删除网络
docker network rm my-bridge

# 容器间通信
# 同一网络内的容器可以通过容器名互相访问
docker run -d --name web --network my-bridge nginx
docker run -it --name client --network my-bridge alpine wget -qO- http://web

3.2 存储卷管理

# ==============================================================================
# Docker 存储类型
# ==============================================================================

# 1. Bind Mount(绑定挂载)
docker run -v /host/path:/container/path nginx

# 2. Volume(命名卷 - 推荐)
docker volume create my-volume
docker run -v my-volume:/container/path nginx

# 3. tmpfs(临时文件系统)
docker run --tmpfs /tmp:rw,size=100m,mode=1777 nginx

# ==============================================================================
# 卷管理命令
# ==============================================================================

# 创建卷
docker volume create my-volume

# 创建卷(带选项)
docker volume create --driver local \
    --opt type=tmpfs \
    --opt device=tmpfs \
    --opt o=size=100m,uid=1000 \
    my-tmpfs-volume

# 列出卷
docker volume ls

# 查看卷详情
docker volume inspect my-volume

# 删除卷
docker volume rm my-volume

# 清理无用卷
docker volume prune

# Back up a volume into ./backup as a gzip-compressed tarball.
# The source volume is mounted read-only so the archive job cannot modify it,
# and the bind-mount path is quoted so directories with spaces work.
docker run --rm \
    -v my-volume:/source:ro \
    -v "$(pwd)/backup:/backup" \
    alpine tar czf /backup/my-volume-backup.tar.gz -C /source .

# Restore the tarball into a volume.
# The backup directory is mounted read-only so a mistake cannot damage the
# archive. Stop containers that use the volume before restoring.
docker run --rm \
    -v my-volume:/target \
    -v "$(pwd)/backup:/backup:ro" \
    alpine tar xzf /backup/my-volume-backup.tar.gz -C /target

3.3 NFS 存储配置

# docker-compose-nfs.yml
version: '3.8'

services:
  app:
    image: nginx
    volumes:
      - nfs-data:/data

volumes:
  nfs-data:
    driver: local
    driver_opts:
      type: nfs
      o: addr=nfs-server.example.com,rw,nolock,hard,intr
      device: ":/export/data"

4. 常用运维命令

4.1 容器管理

# ==============================================================================
# 容器生命周期管理
# ==============================================================================

# 创建容器
docker create --name my-container nginx:latest

# 启动容器
docker start my-container

# 停止容器(发送 SIGTERM,等待 10s)
docker stop my-container

# 强制停止容器(发送 SIGKILL)
docker kill my-container

# 重启容器
docker restart my-container

# 暂停容器
docker pause my-container

# 恢复容器
docker unpause my-container

# 删除容器
docker rm my-container

# 强制删除运行中的容器
docker rm -f my-container

# 删除所有停止的容器
docker container prune

# ==============================================================================
# 容器操作命令
# ==============================================================================

# 运行容器
docker run -d --name my-nginx -p 80:80 nginx:latest

# 运行并进入容器
docker run -it --name my-alpine alpine:latest /bin/sh

# 在运行中的容器执行命令
docker exec -it my-container /bin/bash

# 查看容器日志
docker logs my-container
docker logs -f --tail 100 my-container

# 查看容器进程
docker top my-container

# 查看容器资源使用
docker stats my-container

# 查看容器详情
docker inspect my-container

# 查看容器端口映射
docker port my-container

# 查看容器文件变更
docker diff my-container

# 导出容器
docker export my-container > my-container.tar

# 导入容器
docker import my-container.tar my-image:latest

# 复制文件
docker cp local-file.txt my-container:/app/
docker cp my-container:/app/log.txt ./local-log.txt

# ==============================================================================
# 容器列表查询
# ==============================================================================

# 列出运行中的容器
docker ps

# 列出所有容器(包括停止的)
docker ps -a

# 过滤容器
docker ps --filter "name=myapp"
docker ps --filter "status=running"
docker ps --filter "label=app=web"

# 格式化输出
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"

# 只显示容器 ID
docker ps -q

4.2 镜像管理

# ==============================================================================
# 镜像操作命令
# ==============================================================================

# 构建镜像
docker build -t my-image:v1.0 .
docker build -t my-image:v1.0 -f Dockerfile.prod .

# 拉取镜像
docker pull nginx:latest
docker pull nginx:latest --platform linux/amd64

# 推送镜像
docker push my-registry.example.com/my-image:v1.0

# 列出镜像
docker images

# 删除镜像
docker rmi my-image:v1.0

# 强制删除镜像
docker rmi -f my-image:v1.0

# 清理无用镜像
docker image prune
docker image prune -a  # 删除所有未使用的镜像

# 查看镜像详情
docker inspect nginx:latest

# 查看镜像历史
docker history nginx:latest

# 镜像标签
docker tag my-image:v1.0 my-registry.example.com/my-image:v1.0

# 保存镜像为 tar 文件
docker save -o my-image.tar my-image:v1.0
docker save my-image:v1.0 | gzip > my-image.tar.gz

# 加载镜像
docker load -i my-image.tar
docker load < my-image.tar.gz

# 查看镜像层
docker image inspect nginx:latest --format='{{.RootFS.Layers}}'

# 镜像安全扫描
docker scout cves nginx:latest
trivy image nginx:latest

4.3 系统管理

# ==============================================================================
# Docker 系统信息
# ==============================================================================

# 查看 Docker 版本
docker version

# 查看 Docker 系统信息
docker info

# 查看 Docker 磁盘使用
docker system df
docker system df -v

# 清理系统
docker system prune           # 清理停止的容器、无用网络、悬空镜像
docker system prune -a        # 清理所有未使用的镜像
docker system prune --volumes # 同时清理卷

# ==============================================================================
# Docker 事件监控
# ==============================================================================

# 实时监控 Docker 事件
docker events

# 过滤事件
docker events --filter 'type=container'
docker events --filter 'event=start'
docker events --filter 'container=my-container'

# 指定时间范围
docker events --since '2024-01-01' --until '2024-01-02'

# ==============================================================================
# Docker 日志管理
# ==============================================================================

# 查看 Docker Daemon 日志
journalctl -u docker.service

# 查看容器日志配置
docker inspect --format='{{.HostConfig.LogConfig}}' my-container

# 清理容器日志(谨慎操作)
truncate -s 0 /var/lib/docker/containers/*/*-json.log

# 设置日志轮转(daemon.json)
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3"
  }
}

5. 监控与告警

5.1 Prometheus 配置

# prometheus.yml - Docker monitoring
global:
  scrape_interval: 15s
  # How often the alerting rules below are evaluated.
  evaluation_interval: 15s

# Alerting rules. Without this entry Prometheus never loads the rules file and
# no alert can fire. The file has to be mounted into the Prometheus container
# at this path (for example next to prometheus.yml).
rule_files:
  - /etc/prometheus/docker-alerts.yml

scrape_configs:
  # Docker daemon metrics (requires "metrics-addr" in daemon.json).
  # Replace docker-host with an address of the Docker host that is reachable
  # from inside the Prometheus container.
  - job_name: 'docker'
    static_configs:
      - targets: ['docker-host:9323']

  # cAdvisor - per-container metrics
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']

  # Node Exporter - host metrics
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

5.2 告警规则

# docker-alerts.yml
groups:
  - name: docker-alerts
    rules:
      # 容器退出
      - alert: ContainerExited
        expr: container_last_seen < time() - 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "容器已退出"
          description: "容器 {{ $labels.name }} 已退出"

      # 容器 CPU 使用过高
      - alert: ContainerHighCPU
        expr: rate(container_cpu_usage_seconds_total{name!=""}[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器 CPU 使用过高"
          description: "容器 {{ $labels.name }} CPU 使用率 {{ $value | printf \"%.1f\" }}%"

      # Container memory usage above 85% of its configured limit.
      # cAdvisor reports a limit of 0 for containers started without a memory
      # limit; dividing by it yields +Inf and would fire for every such
      # container, so series with a zero limit are filtered out first.
      # name!="" restricts the rule to real containers (not raw cgroups).
      - alert: ContainerHighMemory
        expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器内存使用过高"
          description: "容器 {{ $labels.name }} 内存使用率 {{ $value | printf \"%.1f\" }}%"

      # Container restarting frequently.
      # The monitoring stack in this document (cAdvisor, node-exporter, dockerd)
      # does not export a container_restart_count metric, so a rule built on it
      # can never fire. Restarts are counted instead as changes of the
      # container's start timestamp reported by cAdvisor.
      # NOTE(review): verify on your cAdvisor version that
      # container_start_time_seconds changes on every restart.
      - alert: ContainerRestarting
        expr: changes(container_start_time_seconds{name!=""}[1h]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器频繁重启"
          description: "容器 {{ $labels.name }} 在过去 1 小时内重启了 {{ $value }} 次"

      # 磁盘空间不足
      - alert: DockerDiskSpaceLow
        expr: ((node_filesystem_size_bytes{mountpoint="/var/lib/docker"} - node_filesystem_avail_bytes{mountpoint="/var/lib/docker"}) / node_filesystem_size_bytes{mountpoint="/var/lib/docker"}) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Docker 存储空间不足"
          description: "Docker 存储使用率 {{ $value | printf \"%.1f\" }}%"

6. 安全加固

6.1 容器安全配置

# 安全加固配置示例
services:
  app:
    image: nginx:alpine
    # 1. 非 root 用户运行
    user: "1000:1000"

    # 2. 只读文件系统
    read_only: true

    # 3. 临时文件系统
    tmpfs:
      - /tmp
      - /var/cache/nginx
      - /var/run

    # 4. 安全选项
    security_opt:
      - no-new-privileges:true
      - apparmor:docker-default
      - seccomp:seccomp-profile.json

    # 5. 能力限制
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE

    # 6. 禁用特权
    privileged: false

    # 7. 资源限制
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 512M
          pids: 100

    # 8. 网络限制
    networks:
      - internal

    # 9. 环境变量
    environment:
      - "MYSQL_ROOT_PASSWORD_FILE=/run/secrets/db_password"

    # 10. Secrets
    secrets:
      - db_password

secrets:
  db_password:
    file: ./secrets/db_password.txt

networks:
  internal:
    internal: true

6.2 安全扫描

# ==============================================================================
# 镜像安全扫描
# ==============================================================================

# Docker Scout 扫描
docker scout cves my-image:latest
docker scout quickview my-image:latest
docker scout recommendations my-image:latest

# Trivy 扫描
trivy image my-image:latest
trivy image --severity HIGH,CRITICAL my-image:latest
trivy image --ignore-unfixed my-image:latest

# Grype 扫描
grype my-image:latest
grype my-image:latest --only-fixed

# ==============================================================================
# 运行时安全检查
# ==============================================================================

# 检查容器特权
docker inspect --format='{{.HostConfig.Privileged}}' my-container

# 检查容器能力
docker inspect --format='{{.HostConfig.CapAdd}}' my-container

# 检查容器用户
docker inspect --format='{{.Config.User}}' my-container

# 检查容器挂载
docker inspect --format='{{.Mounts}}' my-container

# Docker Bench Security(安全基线检查)
docker run --rm -it \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v /etc:/etc \
    -v /usr/bin/docker:/usr/bin/docker \
    docker/docker-bench-security

7. 故障排查

7.1 常见问题排查

# ==============================================================================
# 容器无法启动
# ==============================================================================

# 查看容器日志
docker logs my-container

# 查看容器退出码
docker inspect --format='{{.State.ExitCode}}' my-container

# 查看容器错误信息
docker inspect --format='{{.State.Error}}' my-container

# 常见退出码:
# 0 - 正常退出
# 1 - 应用错误
# 137 - SIGKILL(OOM 或强制停止)
# 139 - 段错误
# 143 - SIGTERM

# ==============================================================================
# 网络问题排查
# ==============================================================================

# 进入容器网络命名空间
docker exec -it my-container sh
ping target-host
nslookup target-host

# 检查端口映射
docker port my-container

# 检查网络配置
docker network inspect my-network

# 抓包分析
docker run --rm --net=container:my-container \
    nicolaka/netshoot tcpdump -i eth0 -nn port 80

# ==============================================================================
# 存储问题排查
# ==============================================================================

# 检查挂载点
docker inspect --format='{{json .Mounts}}' my-container | jq

# 检查磁盘空间
df -h /var/lib/docker

# 检查卷使用
docker system df -v

# Reclaim disk space by removing unused data.
# CAUTION: destructive. -a removes every image not used by a container (not
# just dangling ones) and --volumes removes unused volumes as well. Review
# `docker system df -v` first and make sure no stopped service still needs
# its images or data.
docker system prune -a --volumes

# ==============================================================================
# 性能问题排查
# ==============================================================================

# 查看容器资源使用
docker stats --no-stream my-container

# 查看容器进程
docker top my-container

# 使用 htop 监控
docker run --rm --pid=container:my-container \
    --privileged \
    nicolaka/netshoot htop

# Show the container's detailed memory statistics.
# The file location depends on the cgroup version in use: cgroup v2 exposes
# /sys/fs/cgroup/memory.stat, cgroup v1 exposes
# /sys/fs/cgroup/memory/memory.stat. Try v2 first and fall back to v1.
docker exec my-container sh -c \
    'cat /sys/fs/cgroup/memory.stat 2>/dev/null || cat /sys/fs/cgroup/memory/memory.stat'

7.2 性能调优

# ==============================================================================
# Docker Daemon 调优
# ==============================================================================

# 1. 使用 overlay2 存储驱动
# /etc/docker/daemon.json
{
  "storage-driver": "overlay2"
}

# 2. 调整日志配置
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3"
  }
}

# 3. 调整并发下载数
{
  "max-concurrent-downloads": 10,
  "max-concurrent-uploads": 5
}

# 4. 启用 live-restore(升级不停服务)
{
  "live-restore": true
}

# ==============================================================================
# 容器资源限制
# ==============================================================================

# CPU 限制
docker run --cpus="1.5" --cpu-shares=512 my-image

# 内存限制
docker run --memory="1g" --memory-swap="2g" --memory-reservation="512m" my-image

# IO 限制
docker run --device-read-bps=/dev/sda:10mb \
           --device-write-bps=/dev/sda:10mb \
           --device-read-iops=/dev/sda:1000 \
           my-image

# PID 限制(防止 fork 炸弹)
docker run --pids-limit 100 my-image

8. 最佳实践

8.1 镜像构建原则

| 原则 | 说明 |
| --- | --- |
| 使用多阶段构建 | 减小最终镜像体积 |
| 使用特定版本标签 | 避免 latest,确保可重复性 |
| 最小化层数 | 合并 RUN 指令,减少层数 |
| 利用构建缓存 | 先复制依赖文件,再复制源码 |
| 非 root 用户 | 安全第一,避免特权容器 |
| 只读文件系统 | 防止运行时被篡改 |

8.2 资源限制建议

| 资源 | 建议配置 | 说明 |
| --- | --- | --- |
| CPU | limits: 1-2 cores | 防止 CPU 饥饿 |
| 内存 | limits: 2x requests | 预留缓冲空间 |
| PID | limit: 100-500 | 防止 fork 炸弹 |
| 日志 | max-size: 50-100MB | 防止磁盘占满 |

9. 参考资料


文档版本: 1.0 | 更新日期: 2024-01-15 | 适用环境: Docker 24.x,Docker Compose v2

results matching ""

    No results matching ""