SRE 话题文档:Docker 容器运维
本文档面向生产环境,涵盖 Docker 容器管理、镜像构建、网络存储、安全加固等核心运维场景。
1. 生产环境部署架构
1.1 架构图(ASCII)
┌─────────────────────────────────────────────────────────────────────────────┐
│ Docker 生产环境架构 │
└─────────────────────────────────────────────────────────────────────────────┘
┌─────────────────┐
│ CI/CD Pipeline│
│ (镜像构建) │
└────────┬────────┘
│
┌──────────────────┼──────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
│ Registry │ │ Registry │ │ Registry │
│ (Harbor) │ │ (Docker Hub) │ │ (阿里云 ACR) │
└───────┬───────┘ └───────┬───────┘ └───────┬───────┘
│ │ │
└──────────────────┼──────────────────┘
│
┌─────────────────┴─────────────────┐
│ │
▼ ▼
┌───────────────┐ ┌───────────────┐
│ Docker Host │ │ Docker Host │
│ (Node-01) │ │ (Node-02) │
│ │ │ │
│ ┌───────────┐ │ │ ┌───────────┐ │
│ │ Container │ │ │ │ Container │ │
│ ├───────────┤ │ │ ├───────────┤ │
│ │ Container │ │ │ │ Container │ │
│ ├───────────┤ │ │ ├───────────┤ │
│ │ Container │ │ │ │ Container │ │
│ └───────────┘ │ │ └───────────┘ │
│ │ │ │
│ ┌───────────┐ │ │ ┌───────────┐ │
│ │ Volumes │ │ │ │ Volumes │ │
│ └───────────┘ │ │ └───────────┘ │
└───────────────┘ └───────────────┘
│ │
└─────────────────┬─────────────────┘
│
┌───────┴───────┐
│ Storage │
│ (NFS/Ceph/S3) │
└───────────────┘
┌─────────────────────────────────────────────────────────────────────────────┐
│ 监控 & 管理组件 │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Prometheus │ │ Grafana │ │ cAdvisor │ │ Portainer │ │
│ │ (监控) │ │ (可视化) │ │ (容器指标) │ │ (管理界面) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ │
└─────────────────────────────────────────────────────────────────────────────┘
1.2 Docker Compose 部署(监控栈)
# docker-compose.yml - Docker monitoring stack
#
# Services: Prometheus, Grafana, cAdvisor, Node Exporter, Portainer and a
# local image registry, all attached to the `monitoring` bridge network.
#
# Required variable (shell environment or an untracked .env file beside this
# file): GRAFANA_ADMIN_PASSWORD. Compose aborts with the message below when it
# is missing, so no default password ever reaches a running Grafana.
version: '3.8'

services:
  # Prometheus - metrics storage and querying
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
    networks:
      - monitoring
    restart: unless-stopped

  # Grafana - dashboards
  grafana:
    image: grafana/grafana:10.2.0
    container_name: grafana
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # Never commit a literal password; it is injected at deploy time.
      - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
      - GF_INSTALL_PLUGINS=grafana-clock-panel
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    networks:
      - monitoring
    restart: unless-stopped

  # cAdvisor - per-container metrics
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      # Mount set from the cAdvisor README. The legacy /cgroup path is absent
      # on current hosts (cgroups live under /sys/fs/cgroup, already covered
      # by the /sys mount) and Docker would create it as an empty directory.
      - /dev/disk/:/dev/disk:ro
    ports:
      - "8080:8080"
    networks:
      - monitoring
    restart: unless-stopped

  # Node Exporter - host metrics
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring
    restart: unless-stopped

  # Portainer - Docker management UI
  portainer:
    # Pinned tag: `latest` makes redeployments non-reproducible.
    image: portainer/portainer-ce:2.19.4
    container_name: portainer
    command: -H unix:///var/run/docker.sock
    volumes:
      # The Docker socket grants root-equivalent control of the host;
      # restrict who can reach the Portainer UI accordingly.
      - /var/run/docker.sock:/var/run/docker.sock
      - portainer-data:/data
    ports:
      - "9443:9443"
    networks:
      - monitoring
    restart: unless-stopped

  # Registry - local image registry
  registry:
    image: registry:2
    container_name: registry
    environment:
      - REGISTRY_STORAGE_FILESYSTEM_ROOTDIRECTORY=/var/lib/registry
      - REGISTRY_HTTP_ADDR=0.0.0.0:5000
    volumes:
      - registry-data:/var/lib/registry
    ports:
      - "5000:5000"
    networks:
      - monitoring
    restart: unless-stopped

volumes:
  prometheus-data:
  grafana-data:
  portainer-data:
  registry-data:

networks:
  monitoring:
    driver: bridge
2. 核心配置与优化
2.1 Docker Daemon 配置
// /etc/docker/daemon.json
// NOTE: daemon.json is strict JSON. dockerd refuses to start when the file
// contains comments or keys it does not recognise, so copy only the object
// below (not these caption lines).
// Do not add the following, each of which breaks or misleads:
//   - "gc", "image-gc-high-threshold", "image-gc-low-threshold": these are
//     kubelet settings, not dockerd options. Build-cache GC is "builder.gc".
//   - a "runtimes" entry named "runc": that name is reserved for the built-in
//     runtime, which "default-runtime": "runc" already selects.
//   - "overlay2.override_kernel_check": removed storage option.
//   - "overlay2.size": valid only when /var/lib/docker sits on XFS mounted
//     with pquota; add it under "storage-opts" only if that holds.
// "metrics-addr" listens on every interface so a containerised Prometheus can
// reach it; firewall port 9323 from untrusted networks.
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "5",
    "labels": "app,environment",
    "tag": "{{.Name}}/{{.ID}}"
  },
  "storage-driver": "overlay2",
  "registry-mirrors": [
    "https://mirror.ccs.tencentyun.com"
  ],
  "insecure-registries": [
    "harbor.example.com:5000",
    "localhost:5000"
  ],
  "bip": "172.17.0.1/16",
  "default-ulimits": {
    "nofile": {
      "Name": "nofile",
      "Hard": 65535,
      "Soft": 65535
    }
  },
  "live-restore": true,
  "userland-proxy": false,
  "no-new-privileges": true,
  "default-runtime": "runc",
  "builder": {
    "gc": {
      "enabled": true,
      "defaultKeepStorage": "20GB"
    }
  },
  "metrics-addr": "0.0.0.0:9323",
  "experimental": true
}
2.2 Dockerfile 最佳实践
# ==============================================================================
# Dockerfile best-practice example (Go service)
# ==============================================================================
# 1. Use a multi-stage build: compile in a full toolchain image, ship only the
#    binary in a small runtime image.
# ==================
# Build stage
FROM golang:1.21-alpine AS builder
WORKDIR /app
# Copy only the dependency manifests first so the `go mod download` layer stays
# cached until go.mod / go.sum change
COPY go.mod go.sum ./
RUN go mod download
# Copy the sources and build a static binary (CGO disabled, symbols stripped)
COPY . .
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-w -s" -o main .
# Runtime stage
FROM alpine:3.19
# Install the CA bundle and time-zone data
RUN apk --no-cache add ca-certificates tzdata
# Create a non-root user and group (uid/gid 1000)
RUN addgroup -g 1000 appgroup && \
adduser -u 1000 -G appgroup -D appuser
WORKDIR /app
# Copy the binary and its config directory from the build stage
COPY --from=builder /app/main .
COPY --from=builder /app/config ./config
# Hand ownership of /app to the non-root user
RUN chown -R appuser:appgroup /app
# Drop privileges for everything that follows, including the running process
USER appuser
# Document the listening port
EXPOSE 8080
# Health check: probe /health on localhost:8080 every 30s
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://localhost:8080/health || exit 1
# Start command; CMD supplies default arguments that `docker run` can override
ENTRYPOINT ["./main"]
CMD ["--config", "config/app.yaml"]
# ==============================================================================
# Node.js application Dockerfile
# ==============================================================================
# Build stage
FROM node:20-alpine AS builder
WORKDIR /app
# Copy the package manifests first so the dependency layer is cached
COPY package*.json ./
# Install ALL dependencies: `npm run build` normally needs devDependencies
# (compiler, bundler), so a production-only install here breaks the build
RUN npm ci
# Copy the sources
COPY . .
# Build, then strip devDependencies so only runtime packages are shipped
RUN npm run build && npm prune --omit=dev

# Runtime stage
FROM node:20-alpine
WORKDIR /app
# dumb-init runs as PID 1 to forward signals and reap zombie processes
RUN apk add --no-cache dumb-init
# The official image already provides the unprivileged user/group `node`
# (uid/gid 1000); creating another group with gid 1000 fails, so reuse it
# Copy runtime dependencies and build output
COPY --from=builder --chown=node:node /app/node_modules ./node_modules
COPY --from=builder --chown=node:node /app/dist ./dist
COPY --from=builder --chown=node:node /app/package.json ./
# The HEALTHCHECK below runs this script, so it has to exist in the final image
# NOTE(review): assumes healthcheck.js sits in the project root - confirm
COPY --from=builder --chown=node:node /app/healthcheck.js ./
USER node
EXPOSE 3000
HEALTHCHECK --interval=30s --timeout=3s \
  CMD node healthcheck.js || exit 1
ENTRYPOINT ["dumb-init", "--"]
CMD ["node", "dist/main.js"]
# ==============================================================================
# Python application Dockerfile
# ==============================================================================
# Build stage: compile wheels into a virtual environment
FROM python:3.11-slim AS builder
WORKDIR /app
# Install build-time system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*
# Create the virtual environment and put it first on PATH
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Runtime stage
FROM python:3.11-slim
WORKDIR /app
# Apply security updates
RUN apt-get update && apt-get upgrade -y && \
    apt-get clean && rm -rf /var/lib/apt/lists/*
# Create a non-root user and group (uid/gid 1000)
RUN groupadd -g 1000 appgroup && \
    useradd -u 1000 -g appgroup -m appuser
# Copy the prepared virtual environment from the build stage
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy the application
COPY --chown=appuser:appgroup . .
USER appuser
EXPOSE 8000
# Health check uses only the standard library, so it does not depend on
# `requests` being installed; urlopen raises on HTTP 4xx/5xx and on timeout,
# which makes the check fail when the service is unhealthy or hung
HEALTHCHECK --interval=30s --timeout=3s \
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=2)" || exit 1
CMD ["gunicorn", "--bind", "0.0.0.0:8000", "--workers", "4", "app:app"]
2.3 Docker Compose 完整示例
# docker-compose.yml - production-grade web application
#
# Topology: nginx (frontend network) -> app (frontend + backend) ->
# postgres / redis (backend network, marked internal).
#
# Required variable (shell environment or an untracked .env file beside this
# file): POSTGRES_PASSWORD. Compose aborts with the message below when it is
# missing. The value is embedded in DATABASE_URL, so URL-encode any reserved
# characters it contains.
version: '3.8'

services:
  # Application service
  app:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        - BUILD_ENV=production
    image: myapp:${VERSION:-latest}
    container_name: myapp
    restart: unless-stopped
    ports:
      - "${APP_PORT:-8080}:8080"
    environment:
      - NODE_ENV=production
      - "DATABASE_URL=postgres://user:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}@postgres:5432/mydb"
      - REDIS_URL=redis://redis:6379
      - LOG_LEVEL=info
    env_file:
      - .env.production
    volumes:
      - app-logs:/app/logs
      - app-uploads:/app/uploads
    networks:
      - frontend
      - backend
    # Start only after both data stores report healthy
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 2G
        reservations:
          cpus: '0.5'
          memory: 512M
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 5
        window: 120s
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "5"
    labels:
      - "app.name=myapp"
      - "app.environment=production"
    security_opt:
      - no-new-privileges:true
    # Root filesystem is read-only; writable paths are the named volumes above
    # and the tmpfs below
    read_only: true
    tmpfs:
      - /tmp

  # PostgreSQL database
  postgres:
    image: postgres:15-alpine
    container_name: postgres
    restart: unless-stopped
    environment:
      - POSTGRES_USER=user
      - "POSTGRES_PASSWORD=${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}"
      - POSTGRES_DB=mydb
      - PGDATA=/var/lib/postgresql/data/pgdata
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - backend
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U user -d mydb"]
      interval: 10s
      timeout: 5s
      retries: 5
    deploy:
      resources:
        limits:
          memory: 1G
        reservations:
          memory: 256M

  # Redis cache
  redis:
    image: redis:7-alpine
    container_name: redis
    restart: unless-stopped
    command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
    volumes:
      - redis-data:/data
    networks:
      - backend
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Nginx reverse proxy
  nginx:
    # Pinned minor version instead of the floating `alpine` tag
    image: nginx:1.25-alpine
    container_name: nginx
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
      - ./ssl:/etc/nginx/ssl:ro
      - nginx-logs:/var/log/nginx
    networks:
      - frontend
    # Wait for the upstream to pass its health check so the proxy does not
    # answer 502 during startup
    depends_on:
      app:
        condition: service_healthy
    healthcheck:
      # NOTE(review): `nginx -t` validates the configuration only; it does not
      # prove the server is accepting requests. Consider an HTTP probe against
      # an endpoint that nginx.conf is known to serve.
      test: ["CMD", "nginx", "-t"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  app-logs:
  app-uploads:
  postgres-data:
  redis-data:
  nginx-logs:

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
    # No outbound/inbound connectivity beyond containers on this network
    internal: true
3. 网络与存储
3.1 网络配置
# ==============================================================================
# Docker network types
# ==============================================================================
# 1. Bridge network (the default driver)
docker network create --driver bridge my-bridge
# 2. Host network (the container uses the host's network directly)
docker run --network host nginx
# 3. None network (no networking)
docker run --network none alpine
# 4. Overlay network (Swarm cluster)
docker network create --driver overlay --subnet=10.0.0.0/24 my-overlay
# 5. Macvlan network (attached directly to the physical network via eth0)
docker network create -d macvlan \
--subnet=192.168.1.0/24 \
--gateway=192.168.1.1 \
-o parent=eth0 my-macvlan
# ==============================================================================
# Network commands
# ==============================================================================
# List networks
docker network ls
# Show the details of a network
docker network inspect my-bridge
# Connect a container to a network
docker network connect my-bridge my-container
# Disconnect a container from a network
docker network disconnect my-bridge my-container
# Remove a network
docker network rm my-bridge
# Container-to-container communication:
# containers on the same network can reach each other by container name
docker run -d --name web --network my-bridge nginx
docker run -it --name client --network my-bridge alpine wget -qO- http://web
3.2 存储卷管理
# ==============================================================================
# Docker storage types
# ==============================================================================
# 1. Bind mount (a host path mapped into the container)
docker run -v /host/path:/container/path nginx
# 2. Volume (named volume - recommended)
docker volume create my-volume
docker run -v my-volume:/container/path nginx
# 3. tmpfs (in-memory temporary filesystem)
docker run --tmpfs /tmp:rw,size=100m,mode=1777 nginx
# ==============================================================================
# Volume management commands
# ==============================================================================
# Create a volume
docker volume create my-volume
# Create a volume with driver options (here: a 100 MB tmpfs owned by uid 1000)
docker volume create --driver local \
  --opt type=tmpfs \
  --opt device=tmpfs \
  --opt o=size=100m,uid=1000 \
  my-tmpfs-volume
# List volumes
docker volume ls
# Show the details of a volume
docker volume inspect my-volume
# Remove a volume
docker volume rm my-volume
# Clean up unused volumes.
# On Docker 23.0 and later this removes unused ANONYMOUS volumes only.
docker volume prune
# Also remove unused NAMED volumes (Docker 23.0+). Destroys data - check
# `docker volume ls` first.
docker volume prune -a
# Back up a volume into ./backup (path quoted so directories containing spaces
# work)
docker run --rm \
  -v my-volume:/source \
  -v "$(pwd)/backup":/backup \
  alpine tar czf /backup/my-volume-backup.tar.gz -C /source .
# Restore a volume from ./backup
docker run --rm \
  -v my-volume:/target \
  -v "$(pwd)/backup":/backup \
  alpine tar xzf /backup/my-volume-backup.tar.gz -C /target
3.3 NFS 存储配置
# docker-compose-nfs.yml - mount an NFS export as a named volume through the
# `local` volume driver
version: '3.8'

services:
  app:
    # Pinned tag; an untagged image resolves to `latest`
    image: nginx:1.25
    volumes:
      - nfs-data:/data

volumes:
  nfs-data:
    driver: local
    driver_opts:
      type: nfs
      # `intr` is omitted: the option is deprecated and ignored by the kernel
      # since Linux 2.6.25 (see nfs(5)).
      # NOTE(review): `nolock` disables NFS file locking - keep it only if no
      # two clients write the same files.
      o: "addr=nfs-server.example.com,rw,nolock,hard"
      device: ":/export/data"
4. 常用运维命令
4.1 容器管理
# ==============================================================================
# Container lifecycle management
# ==============================================================================
# Create a container (without starting it)
docker create --name my-container nginx:latest
# Start a container
docker start my-container
# Stop a container (sends SIGTERM, waits 10s)
docker stop my-container
# Force-stop a container (sends SIGKILL)
docker kill my-container
# Restart a container
docker restart my-container
# Pause a container
docker pause my-container
# Resume a paused container
docker unpause my-container
# Remove a container
docker rm my-container
# Force-remove a running container
docker rm -f my-container
# Remove all stopped containers
docker container prune
# ==============================================================================
# Container operation commands
# ==============================================================================
# Run a container in the background
docker run -d --name my-nginx -p 80:80 nginx:latest
# Run a container and open an interactive shell in it
docker run -it --name my-alpine alpine:latest /bin/sh
# Execute a command inside a running container
docker exec -it my-container /bin/bash
# View container logs
docker logs my-container
docker logs -f --tail 100 my-container
# View the processes of a container
docker top my-container
# View container resource usage
docker stats my-container
# View container details
docker inspect my-container
# View container port mappings
docker port my-container
# View filesystem changes made in the container
docker diff my-container
# Export a container's filesystem
docker export my-container > my-container.tar
# Import the exported filesystem as an image
docker import my-container.tar my-image:latest
# Copy files between host and container
docker cp local-file.txt my-container:/app/
docker cp my-container:/app/log.txt ./local-log.txt
# ==============================================================================
# Listing containers
# ==============================================================================
# List running containers
docker ps
# List all containers (including stopped ones)
docker ps -a
# Filter containers
docker ps --filter "name=myapp"
docker ps --filter "status=running"
docker ps --filter "label=app=web"
# Formatted output
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
# Show container IDs only
docker ps -q
4.2 镜像管理
# ==============================================================================
# Image operation commands
# ==============================================================================
# Build an image
docker build -t my-image:v1.0 .
docker build -t my-image:v1.0 -f Dockerfile.prod .
# Pull an image
docker pull nginx:latest
docker pull nginx:latest --platform linux/amd64
# Push an image
docker push my-registry.example.com/my-image:v1.0
# List images
docker images
# Remove an image
docker rmi my-image:v1.0
# Force-remove an image
docker rmi -f my-image:v1.0
# Clean up unused images
docker image prune
docker image prune -a # remove all unused images
# View image details
docker inspect nginx:latest
# View image history
docker history nginx:latest
# Tag an image
docker tag my-image:v1.0 my-registry.example.com/my-image:v1.0
# Save an image to a tar file
docker save -o my-image.tar my-image:v1.0
docker save my-image:v1.0 | gzip > my-image.tar.gz
# Load an image
docker load -i my-image.tar
docker load < my-image.tar.gz
# View image layers
docker image inspect nginx:latest --format='{{.RootFS.Layers}}'
# Image security scanning
docker scout cves nginx:latest
trivy image nginx:latest
4.3 系统管理
# ==============================================================================
# Docker system information
# ==============================================================================
# Show the Docker version
docker version
# Show Docker system information
docker info
# Show Docker disk usage
docker system df
docker system df -v
# Clean up the system
docker system prune # removes stopped containers, unused networks, dangling images
docker system prune -a # removes all unused images
docker system prune --volumes # also removes volumes
# ==============================================================================
# Docker event monitoring
# ==============================================================================
# Watch Docker events in real time
docker events
# Filter events
docker events --filter 'type=container'
docker events --filter 'event=start'
docker events --filter 'container=my-container'
# Restrict to a time range
docker events --since '2024-01-01' --until '2024-01-02'
# ==============================================================================
# Docker log management
# ==============================================================================
# View the Docker daemon logs
journalctl -u docker.service
# View a container's logging configuration
docker inspect --format='{{.HostConfig.LogConfig}}' my-container
# Empty the container log files (use with caution: it truncates the json-file
# log of every container matched by the glob)
truncate -s 0 /var/lib/docker/containers/*/*-json.log
# Configure log rotation (daemon.json)
{
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "3"
}
}
5. 监控与告警
5.1 Prometheus 配置
# prometheus.yml - scrape and rule configuration for Docker monitoring
global:
  scrape_interval: 15s

# Alerting rules (docker-alerts.yml). Without this entry Prometheus never loads
# them. The file must be mounted into the Prometheus container at this path.
rule_files:
  - /etc/prometheus/docker-alerts.yml

scrape_configs:
  # Docker daemon metrics (the daemon's `metrics-addr` endpoint)
  # NOTE(review): `docker-host` is a placeholder - replace it with a name or
  # address that resolves from wherever Prometheus runs.
  - job_name: 'docker'
    static_configs:
      - targets: ['docker-host:9323']

  # cAdvisor - container metrics
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']

  # Node Exporter - host metrics
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
5.2 告警规则
# docker-alerts.yml - Prometheus alerting rules built on cAdvisor and
# node-exporter metrics
groups:
  - name: docker-alerts
    rules:
      # Container exited: cAdvisor has not seen it for more than 60 seconds
      - alert: ContainerExited
        expr: container_last_seen < time() - 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "容器已退出"
          description: "容器 {{ $labels.name }} 已退出"

      # Container CPU usage too high.
      # Summed per container so the value is total CPU use (100 = one full
      # core) rather than one series per CPU.
      - alert: ContainerHighCPU
        expr: sum by (name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器 CPU 使用过高"
          description: "容器 {{ $labels.name }} CPU 使用率 {{ $value | printf \"%.1f\" }}%"

      # Container memory usage too high.
      # `> 0` drops containers with no memory limit: their limit is reported
      # as 0, the ratio becomes +Inf and the alert would fire permanently.
      # The working set is used instead of usage_bytes, which also counts
      # reclaimable page cache.
      - alert: ContainerHighMemory
        expr: (container_memory_working_set_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器内存使用过高"
          description: "容器 {{ $labels.name }} 内存使用率 {{ $value | printf \"%.1f\" }}%"

      # Container restarting frequently.
      # cAdvisor exports no restart counter; a restart shows up as a change of
      # the container's start time, so count those changes instead.
      - alert: ContainerRestarting
        expr: changes(container_start_time_seconds{name!=""}[1h]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器频繁重启"
          description: "容器 {{ $labels.name }} 在过去 1 小时内重启了 {{ $value }} 次"

      # Docker storage running low.
      # NOTE(review): matches only when /var/lib/docker is its own mount
      # point; otherwise use the mountpoint of the filesystem that holds it.
      - alert: DockerDiskSpaceLow
        expr: ((node_filesystem_size_bytes{mountpoint="/var/lib/docker"} - node_filesystem_avail_bytes{mountpoint="/var/lib/docker"}) / node_filesystem_size_bytes{mountpoint="/var/lib/docker"}) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Docker 存储空间不足"
          description: "Docker 存储使用率 {{ $value | printf \"%.1f\" }}%"
6. 安全加固
6.1 容器安全配置
# Security-hardening example: one service with the container-level controls
# switched on, grouped by concern.
services:
  app:
    image: nginx:alpine

    # --- Identity and privileges ---
    # Run as a non-root uid:gid
    user: "1000:1000"
    # Never run privileged
    privileged: false
    # Drop every capability, then add back only what is listed
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE
    # Block privilege escalation; apply AppArmor and seccomp profiles
    security_opt:
      - no-new-privileges:true
      - apparmor:docker-default
      - seccomp:seccomp-profile.json

    # --- Filesystem ---
    # Read-only root filesystem; writable scratch space comes from tmpfs
    read_only: true
    tmpfs:
      - /tmp
      - /var/cache/nginx
      - /var/run

    # --- Resource limits ---
    deploy:
      resources:
        limits:
          cpus: "1"
          memory: 512M
          pids: 100

    # --- Network ---
    networks:
      - internal

    # --- Secrets ---
    # The environment variable carries the path of the secret file, not the
    # secret value itself
    environment:
      MYSQL_ROOT_PASSWORD_FILE: /run/secrets/db_password
    secrets:
      - db_password

secrets:
  db_password:
    file: ./secrets/db_password.txt

networks:
  internal:
    internal: true
6.2 安全扫描
# ==============================================================================
# Image security scanning
# ==============================================================================
# Docker Scout
docker scout cves my-image:latest
docker scout quickview my-image:latest
docker scout recommendations my-image:latest
# Trivy
trivy image my-image:latest
trivy image --severity HIGH,CRITICAL my-image:latest
trivy image --ignore-unfixed my-image:latest
# Grype
grype my-image:latest
grype my-image:latest --only-fixed
# ==============================================================================
# Runtime security checks
# ==============================================================================
# Is the container privileged?
docker inspect --format='{{.HostConfig.Privileged}}' my-container
# Which capabilities were added?
docker inspect --format='{{.HostConfig.CapAdd}}' my-container
# Which user does the container run as?
docker inspect --format='{{.Config.User}}' my-container
# What is mounted into the container?
docker inspect --format='{{.Mounts}}' my-container
# Docker Bench for Security (security baseline check).
# Invocation follows the project's README: the host network/pid/user
# namespaces and the audit_control capability are needed for the host-level
# checks, and every host path is mounted read-only so the audit cannot modify
# the system it inspects.
docker run --rm -it \
  --net host --pid host --userns host --cap-add audit_control \
  -v /etc:/etc:ro \
  -v /usr/bin/containerd:/usr/bin/containerd:ro \
  -v /usr/bin/runc:/usr/bin/runc:ro \
  -v /usr/lib/systemd:/usr/lib/systemd:ro \
  -v /var/lib:/var/lib:ro \
  -v /var/run/docker.sock:/var/run/docker.sock:ro \
  docker/docker-bench-security
7. 故障排查
7.1 常见问题排查
# ==============================================================================
# Container fails to start
# ==============================================================================
# View the container logs
docker logs my-container
# View the container exit code
docker inspect --format='{{.State.ExitCode}}' my-container
# View the container error message
docker inspect --format='{{.State.Error}}' my-container
# Common exit codes:
# 0 - normal exit
# 1 - application error
# 137 - SIGKILL (OOM kill or forced stop)
# 139 - segmentation fault
# 143 - SIGTERM
# ==============================================================================
# Network troubleshooting
# ==============================================================================
# Open a shell inside the container, then run the checks below from that shell
docker exec -it my-container sh
ping target-host
nslookup target-host
# Check port mappings
docker port my-container
# Check the network configuration
docker network inspect my-network
# Packet capture from inside the container's network namespace
docker run --rm --net=container:my-container \
  nicolaka/netshoot tcpdump -i eth0 -nn port 80
# ==============================================================================
# Storage troubleshooting
# ==============================================================================
# Check mount points
docker inspect --format='{{json .Mounts}}' my-container | jq
# Check disk space
df -h /var/lib/docker
# Check volume usage
docker system df -v
# Reclaim space. Start with the conservative form (stopped containers, unused
# networks, dangling images):
docker system prune
# DANGER: the form below additionally deletes every unused image and unused
# volumes, which permanently destroys their data. Review `docker system df -v`
# before running it.
docker system prune -a --volumes
# ==============================================================================
# Performance troubleshooting
# ==============================================================================
# View container resource usage (single snapshot)
docker stats --no-stream my-container
# View the container's processes
docker top my-container
# Monitor with htop inside the container's PID namespace
docker run --rm --pid=container:my-container \
  --privileged \
  nicolaka/netshoot htop
# View container memory details: tries the cgroup v2 path first and falls back
# to the cgroup v1 path
docker exec my-container sh -c 'cat /sys/fs/cgroup/memory.stat 2>/dev/null || cat /sys/fs/cgroup/memory/memory.stat'
7.2 性能调优
# ==============================================================================
# Docker daemon tuning
# ==============================================================================
# 1. Use the overlay2 storage driver
# /etc/docker/daemon.json
{
"storage-driver": "overlay2"
}
# 2. Tune the logging configuration
{
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "3"
}
}
# 3. Tune the number of concurrent downloads/uploads
{
"max-concurrent-downloads": 10,
"max-concurrent-uploads": 5
}
# 4. Enable live-restore (upgrade the daemon without stopping services)
{
"live-restore": true
}
# ==============================================================================
# Container resource limits
# ==============================================================================
# CPU limits
docker run --cpus="1.5" --cpu-shares=512 my-image
# Memory limits
docker run --memory="1g" --memory-swap="2g" --memory-reservation="512m" my-image
# I/O limits
docker run --device-read-bps=/dev/sda:10mb \
--device-write-bps=/dev/sda:10mb \
--device-read-iops=/dev/sda:1000 \
my-image
# PID limit (guards against fork bombs)
docker run --pids-limit 100 my-image
8. 最佳实践
8.1 镜像构建原则
| 原则 | 说明 |
|---|---|
| 使用多阶段构建 | 减小最终镜像体积 |
| 使用特定版本标签 | 避免 latest,确保可重复性 |
| 最小化层数 | 合并 RUN 指令,减少层数 |
| 利用构建缓存 | 先复制依赖文件,再复制源码 |
| 非 root 用户 | 安全第一,避免特权容器 |
| 只读文件系统 | 防止运行时被篡改 |
8.2 资源限制建议
| 资源 | 建议配置 | 说明 |
|---|---|---|
| CPU | limits: 1-2 cores | 防止 CPU 饥饿 |
| 内存 | limits: 2x requests | 预留缓冲空间 |
| PID | limit: 100-500 | 防止 fork 炸弹 |
| 日志 | max-size: 50-100MB | 防止磁盘占满 |
9. 参考资料
- Docker 官方文档: https://docs.docker.com/
- Dockerfile 最佳实践: https://docs.docker.com/build/building/best-practices/
- Docker 安全: https://docs.docker.com/engine/security/
- cAdvisor: https://github.com/google/cadvisor
- Trivy: https://github.com/aquasecurity/trivy
文档版本: 1.0 更新日期: 2024-01-15 适用环境: Docker 24.x,Docker Compose v2