Test upgrade from latest release to current branch image in CI #2059
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Test deployment and reimage on OpenStack | |
on: | |
workflow_dispatch: | |
push: | |
branches: | |
- main | |
paths: | |
- '**' | |
- '!dev/**' | |
- 'dev/setup-env.sh' | |
- '!docs/**' | |
- '!README.md' | |
- '!.gitignore' | |
- '!.github/workflows/' | |
- '.github/workflows/stackhpc' | |
pull_request: | |
paths: | |
- '**' | |
- '!dev/**' | |
- 'dev/setup-env.sh' | |
- '!docs/**' | |
- '!README.md' | |
- '!.gitignore' | |
- '!.github/workflows/' | |
- '.github/workflows/stackhpc' | |
jobs: | |
openstack: | |
name: openstack-ci | |
concurrency: | |
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS | |
cancel-in-progress: true | |
runs-on: ubuntu-22.04 | |
strategy: | |
fail-fast: false # allow other matrix jobs to continue even if one fails | |
matrix: | |
os_version: | |
- RL8 | |
- RL9 | |
env: | |
ANSIBLE_FORCE_COLOR: True | |
OS_CLOUD: openstack | |
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} | |
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings | |
TF_VAR_os_version: ${{ matrix.os_version }} | |
steps: | |
- name: Find the latest release | |
run: | | |
echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" | |
echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG | |
- name: Checkout latest release | |
uses: actions/checkout@v4 | |
with: | |
ref: $LATEST_RELEASE_TAG | |
- name: Override CI_CLOUD if PR label is present | |
if: ${{ github.event_name == 'pull_request' }} | |
run: | | |
# Iterate over the labels | |
labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') | |
echo $labels | |
for label in $labels; do | |
if [[ $label == CI_CLOUD=* ]]; then | |
# Extract the value after 'CI_CLOUD=' | |
CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} | |
echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV | |
fi | |
done | |
- name: Record settings for CI cloud | |
run: | | |
echo CI_CLOUD: ${{ env.CI_CLOUD }} | |
- name: Setup ssh | |
run: | | |
set -x | |
mkdir ~/.ssh | |
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa | |
chmod 0600 ~/.ssh/id_rsa | |
shell: bash | |
- name: Add bastion's ssh key to known_hosts | |
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts | |
shell: bash | |
- name: Install ansible etc | |
run: dev/setup-env.sh | |
- name: Install OpenTofu | |
uses: opentofu/setup-opentofu@v1 | |
with: | |
tofu_version: 1.6.2 | |
- name: Initialise tofu | |
run: tofu init | |
working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu | |
- name: Write clouds.yaml | |
run: | | |
mkdir -p ~/.config/openstack/ | |
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml | |
shell: bash | |
- name: Setup environment-specific inventory/tofu inputs | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook ansible/adhoc/generate-passwords.yml | |
echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml | |
env: | |
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} | |
- name: Provision nodes using latest release image | |
id: provision_servers | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu | |
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
- name: Delete infrastructure if provisioning failed | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu | |
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
if: failure() && steps.provision_servers.outcome == 'failure' | |
- name: Configure cluster at latest release | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible all -m wait_for_connection | |
ansible-playbook -v ansible/site.yml | |
ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Run MPI-based tests at latest release | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/adhoc/hpctests.yml | |
# - name: Run EESSI tests | |
# run: | | |
# . venv/bin/activate | |
# . environments/.stackhpc/activate | |
# ansible-playbook -vv ansible/ci/check_eessi.yml | |
- name: Confirm Open Ondemand is up (via SOCKS proxy) | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
# load ansible variables into shell: | |
ansible-playbook ansible/ci/output_vars.yml \ | |
-e output_vars_hosts=openondemand \ | |
-e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \ | |
-e output_vars_items=bastion_ip,bastion_user,openondemand_servername | |
source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt | |
# setup ssh proxying: | |
sudo apt-get --yes install proxychains | |
echo proxychains installed | |
ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip} | |
echo port 9050 forwarded | |
# check OOD server returns 200: | |
statuscode=$(proxychains wget \ | |
--quiet \ | |
--spider \ | |
--server-response \ | |
--no-check-certificate \ | |
--http-user=demo_user \ | |
--http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \ | |
2>&1) | |
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) | |
env: | |
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} | |
- name: Checkout current branch | |
uses: actions/checkout@v4 | |
- name: Reimage login and control nodes to image in current branch | |
id: reimage_non_compute | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu | |
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json | |
- name: Configure cluster using current branch | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible all -m wait_for_connection | |
ansible-playbook -v ansible/site.yml | |
ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Reimage compute nodes to image in current branch using slurm - tests compute-init | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml | |
ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Check sacct state survived reimage to current branch | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml | |
- name: Check MPI-based tests are shown in Grafana | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/ci/check_grafana.yml | |
- name: Run MPI-based tests again in current branch | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/adhoc/hpctests.yml | |
- name: Delete infrastructure | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu | |
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
if: ${{ success() || cancelled() }} |