Skip to content

Commit 32ca395

Browse files
committed
Refactor worker restart to prevent issues with periodic tests
1 parent 0c6781f commit 32ca395

File tree

4 files changed

+21
-5
lines changed

4 files changed

+21
-5
lines changed
Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
#!/bin/sh
22
set -e
3-
# find nassl worker and restart the container(s)
4-
docker ps --filter label=com.docker.compose.service=worker-nassl --quiet | xargs --no-run-if-empty docker restart
3+
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
4+
# workers are sent a TERM signal, followed by a 10 minute grace period before KILL is sent
5+
for worker in $(docker ps --filter label=com.docker.compose.service=worker-nassl --quiet); do
6+
docker stop "$worker"
7+
docker start "$worker"
8+
# wait for container to be healthy
9+
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
10+
done

docker/cron-docker/periodic/daily/restart_slow_worker

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ $COMPOSE_CMD up --no-deps --no-recreate --wait --scale="$SERVICE=$(($REPLICAS*2)
2121
docker rm --force "$OLD_CONTAINERS"
2222

2323
# restore replica number to original
24-
$COMPOSE_CMD scale $SERVICE=$REPLICAS
24+
$COMPOSE_CMD up --no-deps --no-recreate --wait --scale="$SERVICE=$REPLICAS" "$SERVICE"
Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
#!/bin/sh
22
set -e
3-
# find worker and restart the container(s)
4-
docker ps --filter label=com.docker.compose.service=worker --quiet | xargs --no-run-if-empty docker restart
3+
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
4+
# workers are sent a TERM signal, followed by a 10 minute grace period before KILL is sent
5+
for worker in $(docker ps --filter label=com.docker.compose.service=worker --quiet); do
6+
docker stop "$worker"
7+
docker start "$worker"
8+
# wait for container to be healthy
9+
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
10+
done

docker/docker-compose.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,8 @@ services:
249249
# time after which a SIGKILL is sent to celery after a SIGTERM (warm shutdown), default 10s
250250
# a grace period that is too short causes issues on batch when tasks are killed during the hourly worker restart
251251
stop_grace_period: 10m
252+
# SIGTERM is default, but make it explicit
253+
stop_signal: SIGTERM
252254

253255
depends_on:
254256
db-migrate:
@@ -736,6 +738,8 @@ services:
736738
environment:
737739
- AUTO_UPDATE_TO
738740
- WORKER_SLOW_REPLICAS
741+
- WORKER_REPLICAS
742+
- RELEASE
739743

740744
restart: unless-stopped
741745
logging:

0 commit comments

Comments
 (0)