Skip to content

Commit

Permalink
shake up and settle the ci env vars
Browse files Browse the repository at this point in the history
  • Loading branch information
ric-evans committed Oct 4, 2024
1 parent 06c25fa commit be6dc29
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 63 deletions.
42 changes: 21 additions & 21 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ env:
BOT_NAME: wipacdevbot
BOT_EMAIL: [email protected]
#
DOCKER_IMAGE_TAG: icecube/skymap_scanner:local
CI_DOCKER_IMAGE_TAG: icecube/skymap_scanner:local
#
CI_TEST_RUN_STDOUT_STDERR_DIR: /home/runner/work/skymap_scanner/testrun_outputs
N_WORKERS: 2
REALTIME_EVENTS_DIR: /home/runner/work/skymap_scanner/skymap_scanner/tests/data/realtime_events
SKYSCAN_CACHE_DIR: /home/runner/work/skymap_scanner/skymap_scanner/cache
SKYSCAN_OUTPUT_DIR: /home/runner/work/skymap_scanner/skymap_scanner/output
SKYSCAN_DEBUG_DIR: /home/runner/work/skymap_scanner/skymap_scanner/debug
CI_SKYSCAN_CACHE_DIR: /home/runner/work/skymap_scanner/skymap_scanner/cache
CI_SKYSCAN_OUTPUT_DIR: /home/runner/work/skymap_scanner/skymap_scanner/output
CI_SKYSCAN_DEBUG_DIR: /home/runner/work/skymap_scanner/skymap_scanner/debug
# see source tests/env-vars.sh


Expand Down Expand Up @@ -115,7 +115,7 @@ jobs:
with:
context: .
file: ./Dockerfile
tags: ${{ env.DOCKER_IMAGE_TAG }}
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true


Expand Down Expand Up @@ -155,14 +155,14 @@ jobs:
with:
context: .
file: ./Dockerfile
tags: ${{ env.DOCKER_IMAGE_TAG }}
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true
- uses: eWaterCycle/setup-apptainer@v2
with:
apptainer-version: 1.3.2
- name: build singularity image
run: |
sudo singularity build skymap_scanner.sif docker-daemon://$DOCKER_IMAGE_TAG
sudo singularity build skymap_scanner.sif docker-daemon://$CI_DOCKER_IMAGE_TAG
ls -lh skymap_scanner.sif
- name: run singularity container
run: |
Expand All @@ -179,8 +179,8 @@ jobs:
- name: look at results file (.npz)
run: |
ls .
ls $SKYSCAN_OUTPUT_DIR
outfile=$(ls -d $SKYSCAN_OUTPUT_DIR/*.npz)
ls $CI_SKYSCAN_OUTPUT_DIR
outfile=$(ls -d $CI_SKYSCAN_OUTPUT_DIR/*.npz)
echo $outfile
- name: central server stdout/stderr
Expand Down Expand Up @@ -260,7 +260,7 @@ jobs:
with:
context: .
file: ./Dockerfile
tags: ${{ env.DOCKER_IMAGE_TAG }}
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true
- name: run
# timeout-minutes: 10 # on average max~=5min
Expand All @@ -277,9 +277,9 @@ jobs:
- name: check no nsides skipped
run: |
ls $SKYSCAN_OUTPUT_DIR
ls $CI_SKYSCAN_OUTPUT_DIR
# get newest run*.json
export outfile=$(find $SKYSCAN_OUTPUT_DIR -type f -name "run*.json" -exec stat -c '%y %n' {} + | sort | tail -1 | awk '{print $4}')
export outfile=$(find $CI_SKYSCAN_OUTPUT_DIR -type f -name "run*.json" -exec stat -c '%y %n' {} + | sort | tail -1 | awk '{print $4}')
echo $outfile
python3 -c '
import json
Expand Down Expand Up @@ -357,7 +357,7 @@ jobs:
with:
context: .
file: ./Dockerfile
tags: ${{ env.DOCKER_IMAGE_TAG }}
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true
- name: run
# timeout-minutes: 10 # on average ~6min # yes, `timeout` is used below but this is insurance
Expand Down Expand Up @@ -474,7 +474,7 @@ jobs:
with:
context: .
file: ./Dockerfile
tags: ${{ env.DOCKER_IMAGE_TAG }}
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true
- name: run
# timeout-minutes: 35 # on average max~=26min
Expand All @@ -492,9 +492,9 @@ jobs:
- name: test output against known result (.json)
run: |
ls $SKYSCAN_OUTPUT_DIR
ls $CI_SKYSCAN_OUTPUT_DIR
# get newest run*.json
outfile=$(find $SKYSCAN_OUTPUT_DIR -type f -name "run*.json" -exec stat -c '%y %n' {} + | sort | tail -1 | awk '{print $4}')
outfile=$(find $CI_SKYSCAN_OUTPUT_DIR -type f -name "run*.json" -exec stat -c '%y %n' {} + | sort | tail -1 | awk '{print $4}')
echo $outfile
cat $outfile
pip install . # don't need icecube, so no docker container needed
Expand Down Expand Up @@ -542,15 +542,15 @@ jobs:
with:
context: .
file: ./Dockerfile
tags: ${{ env.DOCKER_IMAGE_TAG }}
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true
- name: run
run: |
source tests/env-vars.sh
docker run --rm -i \
$(env | grep '^SKYSCAN_' | cut -d'=' -f1 | sed 's/^/--env /') \
$DOCKER_IMAGE_TAG \
$CI_DOCKER_IMAGE_TAG \
python tests/file_staging.py
Expand Down Expand Up @@ -584,7 +584,7 @@ jobs:
with:
context: .
file: ./Dockerfile
tags: ${{ env.DOCKER_IMAGE_TAG }}
tags: ${{ env.CI_DOCKER_IMAGE_TAG }}
load: true
- name: run
# timeout-minutes: 15 # on average max~=7min
Expand All @@ -598,7 +598,7 @@ jobs:
--mount type=bind,source=$(readlink -f tests/data/reco_pixel_single/${{ matrix.reco_algo }}/${{ matrix.dir }}),target=/local/test-data \
--env PY_COLORS=1 \
$(env | grep '^SKYSCAN_' | cut -d'=' -f1 | sed 's/^/--env /') \
$DOCKER_IMAGE_TAG \
$CI_DOCKER_IMAGE_TAG \
python -m skymap_scanner.client.reco_icetray \
--infile /local/test-data/in.json \
--client-startup-json /local/test-data/startup.json \
Expand All @@ -617,7 +617,7 @@ jobs:
--mount type=bind,source=$(readlink -f tests/data/reco_pixel_single/${{ matrix.reco_algo }}/${{ matrix.dir }}),target=/local/test-data \
--env PY_COLORS=1 \
$(env | grep '^SKYSCAN_' | cut -d'=' -f1 | sed 's/^/--env /') \
$DOCKER_IMAGE_TAG \
$CI_DOCKER_IMAGE_TAG \
python tests/compare_reco_pixel_single.py \
--actual /local/test-data/out-actual.json \
--expected /local/test-data/out.json \
Expand Down
4 changes: 2 additions & 2 deletions resources/launch_scripts/launch_worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ if [ -n "$_RUN_THIS_SINGULARITY_IMAGE" ]; then
cp "$_RUN_THIS_SINGULARITY_IMAGE" "$EWMS_PILOT_TASK_IMAGE"
export _EWMS_PILOT_CONTAINER_PLATFORM="apptainer"
else
export EWMS_PILOT_TASK_IMAGE="$DOCKER_IMAGE_TAG"
export EWMS_PILOT_TASK_IMAGE="$CI_DOCKER_IMAGE_TAG"
export _EWMS_PILOT_CONTAINER_PLATFORM="docker" # NOTE: technically not needed b/c this is the default value
export _EWMS_PILOT_DOCKER_SHM_SIZE="6gb" # this only needed in ci--the infra would set this in prod
fi
Expand All @@ -45,14 +45,14 @@ export EWMS_PILOT_QUEUE_INCOMING="$SKYSCAN_MQ_TOCLIENT"
export EWMS_PILOT_QUEUE_INCOMING_AUTH_TOKEN="$SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN"
export EWMS_PILOT_QUEUE_INCOMING_BROKER_TYPE="$SKYSCAN_MQ_TOCLIENT_BROKER_TYPE"
export EWMS_PILOT_QUEUE_INCOMING_BROKER_ADDRESS="$SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS"
export EWMS_PILOT_TIMEOUT_QUEUE_INCOMING="$SKYSCAN_MQ_TIMEOUT_TO_CLIENTS"
#
# from-client queue
export EWMS_PILOT_QUEUE_OUTGOING="$SKYSCAN_MQ_FROMCLIENT"
export EWMS_PILOT_QUEUE_OUTGOING_AUTH_TOKEN="$SKYSCAN_MQ_FROMCLIENT_AUTH_TOKEN"
export EWMS_PILOT_QUEUE_OUTGOING_BROKER_TYPE="$SKYSCAN_MQ_FROMCLIENT_BROKER_TYPE"
export EWMS_PILOT_QUEUE_OUTGOING_BROKER_ADDRESS="$SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS"

# run!
ENV="$(dirname $tmp_rootdir)/pyenv-$(basename $tmp_rootdir)"
pip install virtualenv
virtualenv --python python3 "$ENV"
Expand Down
25 changes: 12 additions & 13 deletions resources/launch_scripts/local-scan.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,13 @@ mkdir -p "$outdir"
########################################################################
# Check required env vars

if [ -z "$SKYSCAN_CACHE_DIR" ] || [ -z "$SKYSCAN_OUTPUT_DIR" ] || [ -z "$SKYSCAN_DEBUG_DIR" ]; then
echo "required env vars: SKYSCAN_CACHE_DIR, SKYSCAN_OUTPUT_DIR, SKYSCAN_DEBUG_DIR"
# will fail in mkdirs below...
if [ -z "$CI_SKYSCAN_CACHE_DIR" ] || [ -z "$CI_SKYSCAN_OUTPUT_DIR" ] || [ -z "$CI_SKYSCAN_DEBUG_DIR" ]; then
echo "required env vars: CI_SKYSCAN_CACHE_DIR, CI_SKYSCAN_OUTPUT_DIR, CI_SKYSCAN_DEBUG_DIR"
exit 2
fi
mkdir $SKYSCAN_CACHE_DIR
mkdir $SKYSCAN_OUTPUT_DIR
mkdir $SKYSCAN_DEBUG_DIR
mkdir $CI_SKYSCAN_CACHE_DIR
mkdir $CI_SKYSCAN_OUTPUT_DIR
mkdir $CI_SKYSCAN_DEBUG_DIR

########################################################################
# Misc setup
Expand All @@ -65,8 +65,8 @@ if [ -n "$_RUN_THIS_SINGULARITY_IMAGE" ]; then
python -m skymap_scanner.server \
--reco-algo $_RECO_ALGO \
--event-file $_EVENTS_FILE \
--cache-dir $SKYSCAN_CACHE_DIR \
--output-dir $SKYSCAN_OUTPUT_DIR \
--cache-dir $CI_SKYSCAN_CACHE_DIR \
--output-dir $CI_SKYSCAN_OUTPUT_DIR \
--client-startup-json $CI_SKYSCAN_STARTUP_JSON \
--nsides $_NSIDES \
--simulated-event \
Expand All @@ -76,14 +76,13 @@ else
# DOCKER
docker run --network="host" --rm \
--mount type=bind,source="$(dirname "$_EVENTS_FILE")",target=/local/event,readonly \
--mount type=bind,source="$SKYSCAN_CACHE_DIR",target=/local/cache \
--mount type=bind,source="$SKYSCAN_OUTPUT_DIR",target=/local/output \
--mount type=bind,source="$CI_SKYSCAN_CACHE_DIR",target=/local/cache \
--mount type=bind,source="$CI_SKYSCAN_OUTPUT_DIR",target=/local/output \
--mount type=bind,source="$(dirname "$CI_SKYSCAN_STARTUP_JSON")",target=/local/startup \
--env PY_COLORS=1 \
$(env | grep '^SKYSCAN_' | cut -d'=' -f1 | sed 's/^/--env /') \
$(env | grep '^EWMS_' | cut -d'=' -f1 | sed 's/^/--env /') \
--env "EWMS_PILOT_TASK_TIMEOUT=${EWMS_PILOT_TASK_TIMEOUT:-900}" \
icecube/skymap_scanner:"${SKYSCAN_DOCKER_IMAGE_TAG:-"latest"}" \
"$CI_DOCKER_IMAGE_TAG" \
python -m skymap_scanner.server \
--reco-algo $_RECO_ALGO \
--event-file "/local/event/$(basename "$_EVENTS_FILE")" \
Expand All @@ -101,7 +100,7 @@ pidmap["$!"]="central server"
########################################################################
# Wait for startup.json

./wait_for_file.sh $CI_SKYSCAN_STARTUP_JSON $WAIT_FOR_STARTUP_JSON
./wait_for_file.sh $CI_SKYSCAN_STARTUP_JSON 60

########################################################################
# Launch Workers that each run a Pilot which each run Skyscan Clients
Expand Down
13 changes: 0 additions & 13 deletions skymap_scanner/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,24 +143,11 @@ class EnvConfig:

# TIMEOUTS
#
# seconds -- how long client waits between receiving pixels before thinking event scan is 100% done
# - set to `max(reco duration) + max(subsequent iteration startup time)`
# - think about starved clients
# - normal expiration scenario: the scan is done, no more pixels to scan (alternative: manually kill client process)
SKYSCAN_MQ_TIMEOUT_TO_CLIENTS: int = 60 * 30 # 30 mins
#
# seconds -- how long server waits before thinking all clients are dead
# - set to duration of first reco + client launch (condor)
# - important if clients launch *AFTER* server
# - normal expiration scenario: all clients died (bad condor submit file), otherwise never (server knows when all recos are done)
SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS: int = 3 * 24 * 60 * 60 # 3 days
#
# seconds -- how long client waits before first message (set to duration of server startup)
# - important if clients launch *BEFORE* server
# - normal expiration scenario: server died (ex: tried to read corrupted event file), otherwise never
SKYSCAN_MQ_CLIENT_TIMEOUT_WAIT_FOR_FIRST_MESSAGE: int = 60 * 60 # 60 mins

EWMS_PILOT_TASK_TIMEOUT: int = 60 * 30

# SKYDRIVER VARS
SKYSCAN_SKYDRIVER_ADDRESS: str = "" # SkyDriver REST interface address
Expand Down
2 changes: 1 addition & 1 deletion skymap_scanner/server/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def get_mqclient_connections() -> tuple[mq.Queue, mq.Queue]:
address=cfg.ENV.SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS,
name=cfg.ENV.SKYSCAN_MQ_TOCLIENT,
auth_token=cfg.ENV.SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN,
timeout=cfg.ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS,
# timeout=-1, # NOTE: this mq only sends messages so no timeout needed
)
from_clients_queue = mq.Queue(
cfg.ENV.SKYSCAN_MQ_FROMCLIENT_BROKER_TYPE,
Expand Down
36 changes: 23 additions & 13 deletions tests/env-vars.sh
Original file line number Diff line number Diff line change
@@ -1,38 +1,48 @@
#!/bin/bash
set -ex # file is sourced so turn off at end

export SKYSCAN_SKYDRIVER_SCAN_ID=$(uuidgen)
########################################################################
#
# Export many environment variables needed to run a local scan
#
# NOTE: source this file
#
########################################################################

# export SKYSCAN_CACHE_DIR=$PWD/cache-dir -- rely on user value
# export SKYSCAN_OUTPUT_DIR=$PWD/output-dir -- rely on user value
export SKYSCAN_SKYDRIVER_SCAN_ID=$(uuidgen)

# to-client queue
# -> server
export SKYSCAN_MQ_TOCLIENT="to-clients-$SKYSCAN_SKYDRIVER_SCAN_ID"
export SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN=${SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN:-""} # note: set in ci job
export SKYSCAN_MQ_TOCLIENT_BROKER_TYPE=${SKYSCAN_MQ_TOCLIENT_BROKER_TYPE:-"rabbitmq"}
export SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS=${SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS:-""} # note: set in ci job
# -> worker/client/pilot
# note: set in launch_worker.sh
#
# from-client queue
# -> server
export SKYSCAN_MQ_FROMCLIENT="from-clients-$SKYSCAN_SKYDRIVER_SCAN_ID"
export SKYSCAN_MQ_FROMCLIENT_AUTH_TOKEN=${SKYSCAN_MQ_FROMCLIENT_AUTH_TOKEN:-""} # note: set in ci job
export SKYSCAN_MQ_FROMCLIENT_BROKER_TYPE=${SKYSCAN_MQ_FROMCLIENT_BROKER_TYPE:-"rabbitmq"}
export SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS=${SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS:-""} # note: set in ci job
# -> worker/client/pilot
# note: set in launch_worker.sh

export EWMS_PILOT_TASK_TIMEOUT=${EWMS_PILOT_TASK_TIMEOUT:-1800} # TODO - adjust
# timeouts -- these are listed in order of occurrence
# -> worker/client/pilot
export EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE=${EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE:-60}

export EWMS_PILOT_KEEP_ALL_TASK_FILES="True" # don't delete stderr/stdout files

# export SKYSCAN_DEBUG_DIR=debug-pkl-dir -- rely on user value
export SKYSCAN_MQ_TIMEOUT_TO_CLIENTS=${SKYSCAN_MQ_TIMEOUT_TO_CLIENTS:-5}
export EWMS_PILOT_TIMEOUT_QUEUE_INCOMING=${EWMS_PILOT_TIMEOUT_QUEUE_INCOMING:-5}
export EWMS_PILOT_TASK_TIMEOUT=${EWMS_PILOT_TASK_TIMEOUT:-1800} # TODO - adjust
# -> server
export SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS=${EWMS_PILOT_TASK_TIMEOUT:-600} # TODO - adjust
# export SKYSCAN_MQ_CLIENT_TIMEOUT_WAIT_FOR_FIRST_MESSAGE=0

export SKYSCAN_DOCKER_IMAGE_TAG=${SKYSCAN_DOCKER_IMAGE_TAG:-"local"}
# other/misc
# -> worker/client/pilot
export SKYSCAN_MINI_TEST=${SKYSCAN_MINI_TEST:-'yes'}
export SKYSCAN_LOG=${SKYSCAN_LOG:-"DEBUG"}
export SKYSCAN_LOG_THIRD_PARTY=${SKYSCAN_LOG_THIRD_PARTY:-"INFO"}

export WAIT_FOR_STARTUP_JSON=${WAIT_FOR_STARTUP_JSON:-60}
# -> worker/client/pilot
export EWMS_PILOT_KEEP_ALL_TASK_FILES="True" # don't delete stderr/stdout files

set +ex # file is sourced so turn off

0 comments on commit be6dc29

Please sign in to comment.