包括:
- 完整的 autoheal.sh(支持每分钟检查一次、连续 5 次 unhealthy 才重启)
- Dockerfile
- docker-compose.yml
- 详细文档,包含参数说明、用法
1️⃣ autoheal.sh
#!/usr/bin/env bash
#
# autoheal.sh - restart Docker containers that stay unhealthy.
#
# Usage:
#   autoheal.sh autoheal     run the watchdog loop
#   autoheal.sh CMD [ARG..]  exec CMD instead (any first argument but "autoheal")
#
# Environment:
#   DOCKER_SOCK                    Unix socket of the Docker daemon
#   CURL_TIMEOUT                   curl --max-time for every request, seconds
#   WEBHOOK_URL                    optional URL that receives restart results
#   AUTOHEAL_CONTAINER_LABEL       label that must be "true" on watched
#                                  containers, or "all" to watch every container
#   AUTOHEAL_START_PERIOD          seconds to wait before the first check
#   AUTOHEAL_INTERVAL              seconds between checks
#   AUTOHEAL_DEFAULT_STOP_TIMEOUT  restart timeout used when a container has no
#                                  autoheal.stop.timeout label
#   AUTOHEAL_MAX_FAILS             consecutive unhealthy checks before a restart
#
# The script relies on bash features (pipefail, local, substring expansion),
# hence the bash shebang.
set -eo pipefail

DOCKER_SOCK=${DOCKER_SOCK:-/var/run/docker.sock}
CURL_TIMEOUT=${CURL_TIMEOUT:-30}
WEBHOOK_URL=${WEBHOOK_URL:-""}

HTTP_ENDPOINT="http://localhost"

AUTOHEAL_CONTAINER_LABEL=${AUTOHEAL_CONTAINER_LABEL:-autoheal}
AUTOHEAL_START_PERIOD=${AUTOHEAL_START_PERIOD:-0}
AUTOHEAL_INTERVAL=${AUTOHEAL_INTERVAL:-60}   # check once per minute by default
AUTOHEAL_DEFAULT_STOP_TIMEOUT=${AUTOHEAL_DEFAULT_STOP_TIMEOUT:-10}
AUTOHEAL_MAX_FAILS=${AUTOHEAL_MAX_FAILS:-5}  # restart after 5 consecutive fails

# One "<container id> <count>" line per container that is currently unhealthy.
STATE_FILE="/tmp/autoheal_counts"

#######################################
# Call the Docker API over the Unix socket.
# Globals:   CURL_TIMEOUT, DOCKER_SOCK (read)
# Arguments: extra curl arguments, ending with the URL
# Outputs:   response body on stdout
# Returns:   curl's exit status
#######################################
docker_curl() {
  curl --max-time "${CURL_TIMEOUT}" --no-buffer -s \
    --unix-socket "${DOCKER_SOCK}" \
    "$@"
}

#######################################
# List unhealthy containers, optionally restricted to the watched label.
# Globals:   AUTOHEAL_CONTAINER_LABEL, HTTP_ENDPOINT (read)
# Outputs:   the JSON returned by /containers/json
# Returns:   curl's exit status
#######################################
get_container_info() {
  local label_filter url

  if [ "$AUTOHEAL_CONTAINER_LABEL" = "all" ]; then
    label_filter=""
  else
    label_filter=",\"label\":\[\"${AUTOHEAL_CONTAINER_LABEL}=true\"\]"
  fi
  # Braces and brackets are backslash-escaped so curl's URL globbing treats
  # them literally.
  url="${HTTP_ENDPOINT}/containers/json?filters=\{\"health\":\[\"unhealthy\"\]${label_filter}\}"
  docker_curl "$url"
}

#######################################
# Ask Docker to restart a container.
# Arguments: $1 - container id, $2 - stop timeout in seconds
# Returns:   non-zero if the request fails or Docker answers with an HTTP error
#######################################
restart_container() {
  local container_id="$1"
  local timeout="$2"

  docker_curl -f -X POST \
    "${HTTP_ENDPOINT}/containers/${container_id}/restart?t=${timeout}"
}

#######################################
# Print the stored unhealthy count for a container.
# Prints nothing when the id is unknown and still returns 0, so that
# `count=$(get_count ...)` cannot abort the script under `set -e`/pipefail.
# Globals:   STATE_FILE (read)
# Arguments: $1 - container id
#######################################
get_count() {
  awk -v id="$1" '$1 == id { print $2; exit }' "$STATE_FILE"
}

#######################################
# Store the unhealthy count for a container, replacing any previous entry.
# Globals:   STATE_FILE (read, file rewritten)
# Arguments: $1 - container id, $2 - count
#######################################
set_count() {
  local tmp="${STATE_FILE}.tmp"

  awk -v id="$1" '$1 != id' "$STATE_FILE" > "$tmp"
  printf '%s %s\n' "$1" "$2" >> "$tmp"
  mv "$tmp" "$STATE_FILE"
}

#######################################
# Drop the counters of every container that is not in the given list.
# Called once per cycle with the ids that are unhealthy right now, so a
# container that recovered starts from zero again and only *consecutive*
# unhealthy checks add up.
# Globals:   STATE_FILE (read, file rewritten)
# Arguments: $1 - whitespace/newline separated container ids to keep
#######################################
prune_counts() {
  local keep tmp="${STATE_FILE}.tmp"

  # awk -v is not reliable with embedded newlines; flatten the list first.
  keep=$(printf '%s' "$1" | tr '\n' ' ')
  awk -v ids="$keep" '
    BEGIN { n = split(ids, list, " "); for (i = 1; i <= n; i++) wanted[list[i]] = 1 }
    ($1 in wanted)
  ' "$STATE_FILE" > "$tmp"
  mv "$tmp" "$STATE_FILE"
}

#######################################
# Best-effort notification; a failing webhook never stops the watchdog.
# The payload is a JSON object with a single "text" field.
# NOTE(review): payload shape is a Slack-style assumption - confirm against
# the receiving service.
# Globals:   WEBHOOK_URL, CURL_TIMEOUT (read)
# Arguments: message text
# Returns:   always 0
#######################################
notify_webhook() {
  local payload

  [ -n "$WEBHOOK_URL" ] || return 0
  payload=$(jq -n --arg text "$*" '{text: $text}') || return 0
  if ! curl --max-time "${CURL_TIMEOUT}" -s -o /dev/null -X POST \
    -H "Content-Type: application/json" -d "$payload" "$WEBHOOK_URL"; then
    echo "$(date '+%F %T') Webhook notification failed" >&2
  fi
  return 0
}

#######################################
# Handle one unhealthy container: bump its counter and restart it once the
# counter reaches AUTOHEAL_MAX_FAILS.
# Containers without a name or in state "restarting" are skipped and keep
# their stored counter untouched.
# Globals:   AUTOHEAL_MAX_FAILS (read)
# Arguments: $1 - id, $2 - name, $3 - state, $4 - stop timeout
#######################################
process_container() {
  local id="$1" name="$2" state="$3" timeout="$4"
  local short_id="${id:0:12}"
  local now count

  now=$(date '+%F %T')
  count=$(get_count "$id")
  count=${count:-0}

  if [ "$name" = "null" ]; then
    echo "$now Container ${short_id} name null, skip"
    return 0
  fi
  if [ "$state" = "restarting" ]; then
    echo "$now Container $name ($short_id) restarting, skip"
    return 0
  fi

  count=$((count + 1))
  echo "$now Container $name ($short_id) unhealthy count=$count"
  if [ "$count" -ge "$AUTOHEAL_MAX_FAILS" ]; then
    echo "$now Container $name ($short_id) reached $AUTOHEAL_MAX_FAILS, restarting"
    if restart_container "$id" "$timeout"; then
      echo "$now Restarted container $short_id ok"
      notify_webhook "Container $name ($short_id) was unhealthy and has been restarted"
    else
      echo "$now Restarted container $short_id failed" >&2
      notify_webhook "Container $name ($short_id) was unhealthy but the restart failed"
    fi
    # Start counting from scratch whether or not the restart worked.
    count=0
  fi
  set_count "$id" "$count"
}

#######################################
# Watchdog loop. Never returns; a failing Docker API call or unparsable
# response aborts the script (set -e), leaving recovery to the container
# restart policy.
#######################################
run_autoheal() {
  local stop_timeout unhealthy_json unhealthy_ids
  local container_id container_name container_state timeout

  touch "$STATE_FILE"
  if [ "$AUTOHEAL_START_PERIOD" -gt 0 ]; then
    sleep "$AUTOHEAL_START_PERIOD"
  fi

  # Per-container label wins over the global default.
  stop_timeout=".Labels[\"autoheal.stop.timeout\"] // $AUTOHEAL_DEFAULT_STOP_TIMEOUT"

  while true; do
    unhealthy_json=$(get_container_info)
    unhealthy_ids=$(printf '%s' "$unhealthy_json" | jq -r '.[].Id')
    prune_counts "$unhealthy_ids"

    # jq prints four lines per container: id, first name, state, stop timeout.
    printf '%s' "$unhealthy_json" \
      | jq -r ".[] | .Id, .Names[0], .State, (${stop_timeout})" \
      | while read -r container_id \
        && read -r container_name \
        && read -r container_state \
        && read -r timeout; do
        # Keep helpers away from the pipe the loop is reading from.
        process_container "$container_id" "$container_name" \
          "$container_state" "$timeout" < /dev/null
      done

    # Sleep in the background so the TERM trap fires immediately instead of
    # after the whole interval.
    sleep "$AUTOHEAL_INTERVAL" &
    wait "$!"
  done
}

#######################################
# Entry point: run the watchdog for "autoheal", otherwise exec the arguments.
#######################################
main() {
  if [ "${1:-}" = "autoheal" ]; then
    if [ ! -e "$DOCKER_SOCK" ]; then
      echo "$(date '+%F %T') Docker socket $DOCKER_SOCK not found" >&2
      exit 1
    fi
    run_autoheal
  else
    exec "$@"
  fi
}

# Exit with the conventional 128+15 status on SIGTERM (docker stop).
trap 'exit 143' TERM

main "$@"
2️⃣ Dockerfile
FROM alpine:3.19

# Install dependencies
RUN apk add --no-cache bash curl jq

# Copy the script into the image and make it executable
COPY autoheal.sh /usr/local/bin/autoheal.sh
RUN chmod +x /usr/local/bin/autoheal.sh

ENTRYPOINT ["/usr/local/bin/autoheal.sh"]
CMD ["autoheal"]
构建:
# Build the image from the Dockerfile in the current directory, tagged my-autoheal.
docker build -t my-autoheal .
3️⃣ docker-compose.yml
version: "3.8"
services:
  autoheal:
    image: my-autoheal:latest
    container_name: autoheal
    restart: unless-stopped
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      # === Example parameters ===
      - DOCKER_SOCK=/var/run/docker.sock
      - AUTOHEAL_CONTAINER_LABEL=all
      - AUTOHEAL_START_PERIOD=0
      - AUTOHEAL_INTERVAL=60
      - AUTOHEAL_MAX_FAILS=5
      - AUTOHEAL_DEFAULT_STOP_TIMEOUT=10
      # - WEBHOOK_URL=https://your-webhook
启动:
# Start the services defined in docker-compose.yml in the background.
docker-compose up -d
4️⃣ 参数说明
| 环境变量 | 说明 | 默认值 |
|---|---|---|
| DOCKER_SOCK | Docker Daemon 的 Unix socket 路径(脚本通过 curl 的 --unix-socket 访问,不支持 TCP 地址) | /var/run/docker.sock |
| CURL_TIMEOUT | 调用 Docker API 时 curl 的超时时间(秒) | 30 |
| AUTOHEAL_CONTAINER_LABEL | 要监控的容器标签:autoheal(只监控打了 autoheal=true 标签的容器),all(所有容器) | autoheal |
| AUTOHEAL_START_PERIOD | 容器启动后延迟多少秒再开始监控 | 0 |
| AUTOHEAL_INTERVAL | 检查间隔秒数 | 60 |
| AUTOHEAL_MAX_FAILS | 连续多少次 unhealthy 才重启 | 5 |
| AUTOHEAL_DEFAULT_STOP_TIMEOUT | 停止容器的超时时间(秒),可用容器标签 autoheal.stop.timeout 单独覆盖 | 10 |
| WEBHOOK_URL | 重启成功/失败时发送通知的 Webhook URL(可选) | 空 |
上面的脚本只通过 Unix socket 访问 Docker;如果要用 TCP 远程 Docker API,需要自行扩展脚本,例如增加 CA_CERT、CLIENT_CERT、CLIENT_KEY 等证书路径参数。
5️⃣ 使用示例
你的业务容器:
services:
  myapp:
    image: myapp:latest
    labels:
      - autoheal=true  # needed when AUTOHEAL_CONTAINER_LABEL=autoheal
再用上面的 autoheal 服务即可实现:
- 每分钟检查一次容器健康
- 连续 5 次 unhealthy 才重启
- 重启成功/失败可通过 Webhook 通知