Skip to content

Test upgrade from latest release to current branch image in CI #2067

Test upgrade from latest release to current branch image in CI

Test upgrade from latest release to current branch image in CI #2067

Workflow file for this run

name: Test deployment and reimage on OpenStack
on:
workflow_dispatch:
push:
branches:
- main
paths:
- '**'
- '!dev/**'
- 'dev/setup-env.sh'
- '!docs/**'
- '!README.md'
- '!.gitignore'
- '!.github/workflows/'
- '.github/workflows/stackhpc'
pull_request:
paths:
- '**'
- '!dev/**'
- 'dev/setup-env.sh'
- '!docs/**'
- '!README.md'
- '!.gitignore'
- '!.github/workflows/'
- '.github/workflows/stackhpc'
jobs:
openstack:
name: openstack-ci
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix:
os_version:
- RL8
- RL9
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
TF_VAR_os_version: ${{ matrix.os_version }}
steps:
- name: Find the latest release
run: |
echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV"
- name: Checkout latest release
uses: actions/checkout@v4
with:
ref: ${{ env.LATEST_RELEASE_TAG }}
- name: Find stackhpc tofu/terraform directory
# changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541
run: |
echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu )) >> "$GITHUB_ENV"
- name: Override CI_CLOUD if PR label is present
if: ${{ github.event_name == 'pull_request' }}
run: |
# Iterate over the labels
labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name')
echo $labels
for label in $labels; do
if [[ $label == CI_CLOUD=* ]]; then
# Extract the value after 'CI_CLOUD='
CI_CLOUD_OVERRIDE=${label#CI_CLOUD=}
echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV
fi
done
- name: Record debug info
run: |
echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG
echo STACKHPC_TF_DIR: $STACKHPC_TF_DIR
echo CI_CLOUD: $CI_CLOUD
- name: Setup ssh
run: |
set -x
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
shell: bash
- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
shell: bash
- name: Install ansible etc
run: dev/setup-env.sh
- name: Install OpenTofu
uses: opentofu/setup-opentofu@v1
with:
tofu_version: 1.6.2
- name: Initialise tofu
run: tofu init
working-directory: ${{ env.STACKHPC_TF_DIR }}
- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
shell: bash
- name: Setup environment-specific inventory/tofu inputs
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook ansible/adhoc/generate-passwords.yml
echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
env:
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
- name: Provision nodes using latest release image
id: provision_servers
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $STACKHPC_TF_DIR
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
- name: Delete infrastructure if provisioning failed
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $STACKHPC_TF_DIR
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: failure() && steps.provision_servers.outcome == 'failure'
- name: Configure cluster at latest release
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible all -m wait_for_connection
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Run MPI-based tests at latest release
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/adhoc/hpctests.yml
# - name: Run EESSI tests
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible-playbook -vv ansible/ci/check_eessi.yml
- name: Confirm Open Ondemand is up (via SOCKS proxy)
run: |
. venv/bin/activate
. environments/.stackhpc/activate
# load ansible variables into shell:
ansible-playbook ansible/ci/output_vars.yml \
-e output_vars_hosts=openondemand \
-e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \
-e output_vars_items=bastion_ip,bastion_user,openondemand_servername
source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt
# setup ssh proxying:
sudo apt-get --yes install proxychains
echo proxychains installed
ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip}
echo port 9050 forwarded
# check OOD server returns 200:
statuscode=$(proxychains wget \
--quiet \
--spider \
--server-response \
--no-check-certificate \
--http-user=demo_user \
--http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \
2>&1)
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1)
env:
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
- name: Checkout current branch
uses: actions/checkout@v4
- name: Find stackhpc tofu/terraform directory
# changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541
run: |
echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu \) ) >> "$GITHUB_ENV"
- name: Reimage login and control nodes to image in current branch
id: reimage_non_compute
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $STACKHPC_TF_DIR
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json
- name: Configure cluster using current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible all -m wait_for_connection
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Reimage compute nodes to image in current branch using slurm - tests compute-init
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Check sacct state survived reimage to current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
- name: Check MPI-based tests are shown in Grafana
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_grafana.yml
- name: Run MPI-based tests again in current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/adhoc/hpctests.yml
- name: Delete infrastructure
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $STACKHPC_TF_DIR
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: ${{ success() || cancelled() }}