Skip to content

Commit

Permalink
Merge branch 'main' into feat/baremetal_host_v2
Browse files Browse the repository at this point in the history
  • Loading branch information
priteau authored Jan 16, 2025
2 parents 7c65424 + 5f7e48f commit ec81869
Show file tree
Hide file tree
Showing 123 changed files with 2,189 additions and 385 deletions.
139 changes: 139 additions & 0 deletions .github/workflows/extra.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
name: Test extra build
on:
workflow_dispatch:
push:
branches:
- main
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'ansible/roles/doca/**'
- 'ansible/roles/cuda/**'
- 'ansible/roles/lustre/**'
- '.github/workflows/extra.yml'
pull_request:
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'ansible/roles/doca/**'
- 'ansible/roles/cuda/**'
- 'ansible/roles/lustre/**'
- '.github/workflows/extra.yml'

jobs:
doca:
name: extra-build
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
build:
- image_name: openhpc-extra-RL8
source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
inventory_groups: doca,cuda,lustre
volume_size: 30 # needed for cuda
- image_name: openhpc-extra-RL9
source_image_name_key: RL9
inventory_groups: doca,cuda,lustre
volume_size: 30 # needed for cuda
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}

steps:
- uses: actions/checkout@v2

- name: Load current fat images into GITHUB_ENV
# see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string
run: |
{
echo 'FAT_IMAGES<<EOF'
cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
echo EOF
} >> "$GITHUB_ENV"
- name: Record settings
run: |
echo CI_CLOUD: ${{ env.CI_CLOUD }}
echo FAT_IMAGES: ${FAT_IMAGES}
- name: Setup ssh
run: |
set -x
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
shell: bash

- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
shell: bash

- name: Install ansible etc
run: dev/setup-env.sh

- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
shell: bash

- name: Setup environment
run: |
. venv/bin/activate
. environments/.stackhpc/activate
- name: Build fat image with packer
id: packer_build
run: |
set -x
. venv/bin/activate
. environments/.stackhpc/activate
cd packer/
packer init .
PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
-var "volume_size=${{ matrix.build.volume_size }}" \
openstack.pkr.hcl
- name: Get created image names from manifest
id: manifest
run: |
. venv/bin/activate
IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
while ! openstack image show -f value -c name $IMAGE_ID; do
sleep 5
done
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
echo $IMAGE_ID > image-id.txt
echo $IMAGE_NAME > image-name.txt
- name: Make image usable for further builds
run: |
. venv/bin/activate
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
- name: Delete image for automatically-run workflows
run: |
. venv/bin/activate
openstack image delete "${{ steps.manifest.outputs.image-id }}"
if: ${{ github.event_name != 'workflow_dispatch' }}

- name: Upload manifest artifact
uses: actions/upload-artifact@v4
with:
name: image-details-${{ matrix.build.image_name }}
path: |
./image-id.txt
./image-name.txt
overwrite: true
40 changes: 20 additions & 20 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,25 @@ jobs:
openstack:
name: openstack-imagebuild
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
os_version:
- RL8
- RL9
build:
- openstack.openhpc
- image_name: openhpc-RL8
source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2
inventory_groups: control,compute,login,update
- image_name: openhpc-RL9
source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2
inventory_groups: control,compute,login,update
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
SOURCE_IMAGES_MAP: |
{
"RL8": {
"openstack.openhpc": "rocky-latest-RL8"
},
"RL9": {
"openstack.openhpc": "rocky-latest-RL9"
}
}
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -84,13 +79,11 @@ jobs:
PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-only=${{ matrix.build }} \
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
-var "source_image_name=${{ matrix.build.source_image_name }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
openstack.pkr.hcl
env:
PKR_VAR_os_version: ${{ matrix.os_version }}
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }}
- name: Get created image names from manifest
id: manifest
Expand All @@ -101,13 +94,20 @@ jobs:
sleep 5
done
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
echo $IMAGE_ID > image-id.txt
echo $IMAGE_NAME > image-name.txt
- name: Make image usable for further builds
run: |
. venv/bin/activate
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
- name: Upload manifest artifact
uses: actions/upload-artifact@v4
with:
name: image-details-${{ matrix.build }}-${{ matrix.os_version }}
name: image-details-${{ matrix.build.image_name }}
path: |
./image-id.txt
./image-name.txt
Expand Down
28 changes: 24 additions & 4 deletions .github/workflows/nightly-cleanup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,31 @@ jobs:
for cluster_prefix in ${ci_clusters}
do
echo "Processing cluster: $cluster_prefix"
TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value)
if [[ $TAGS =~ "keep" ]]; then
echo "Skipping ${cluster_prefix} - control instance is tagged as keep"
# Get all servers with the matching name for control node
CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json)
SERVER_COUNT=$(echo "$CONTROL_SERVERS" | jq length)
if [[ $SERVER_COUNT -gt 1 ]]; then
echo "Multiple servers found for control node '${cluster_prefix}-control'. Checking tags for each..."
for server in $(echo "$CONTROL_SERVERS" | jq -r '.[].ID'); do
# Get tags for each control node
TAGS=$(openstack server show "$server" --column tags --format value)
if [[ $TAGS =~ "keep" ]]; then
echo "Skipping ${cluster_prefix} (server ${server}) - control instance is tagged as keep"
else
./dev/delete-cluster.py ${cluster_prefix} --force
fi
done
else
./dev/delete-cluster.py ${cluster_prefix} --force
# If only one server, extract its tags and proceed
TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags')
if [[ $TAGS =~ "keep" ]]; then
echo "Skipping ${cluster_prefix} - control instance is tagged as keep"
else
./dev/delete-cluster.py ${cluster_prefix} --force
fi
fi
done
shell: bash
57 changes: 23 additions & 34 deletions .github/workflows/nightlybuild.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,31 @@ on:
- SMS
- ARCUS
schedule:
- cron: '0 0 * * *' # Run at midnight
- cron: '0 0 * * *' # Run at midnight on default branch

jobs:
openstack:
name: openstack-imagebuild
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
os_version:
- RL8
- RL9
build:
- openstack.rocky-latest
- image_name: rocky-latest-RL8
source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2
inventory_groups: update
- image_name: rocky-latest-RL9
source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
inventory_groups: update
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
SOURCE_IMAGES_MAP: |
{
"RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2",
"RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2"
}
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -82,15 +81,13 @@ jobs:
PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-only=${{ matrix.build }} \
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
-var "source_image_name=${{ matrix.build.source_image_name }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "image_name_version=" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
openstack.pkr.hcl
env:
PKR_VAR_os_version: ${{ matrix.os_version }}
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }}

- name: Get created image names from manifest
id: manifest
run: |
Expand Down Expand Up @@ -124,7 +121,7 @@ jobs:
name: upload-nightly-targets
needs: openstack
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }}
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }}
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
Expand All @@ -134,18 +131,15 @@ jobs:
- LEAFCLOUD
- SMS
- ARCUS
os_version:
- RL8
- RL9
image:
- rocky-latest
build:
- image_name: rocky-latest-RL8
- image_name: rocky-latest-RL9
exclude:
- target_cloud: LEAFCLOUD
env:
OS_CLOUD: openstack
SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
TARGET_CLOUD: ${{ matrix.target_cloud }}
IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}"
steps:
- uses: actions/checkout@v2

Expand All @@ -160,42 +154,37 @@ jobs:
. venv/bin/activate
pip install -U pip
pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
shell: bash
- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml
shell: bash
- name: Download source image
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml
openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }}
shell: bash
openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }}
- name: Upload to target cloud
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml
openstack image create "${{ env.IMAGE_NAME }}" \
--file "${{ env.IMAGE_NAME }}" \
openstack image create "${{ matrix.build.image_name }}" \
--file "${{ matrix.build.image_name }}" \
--disk-format qcow2 \
shell: bash
- name: Delete old latest image from target cloud
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml
IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l)
IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l)
if [ "$IMAGE_COUNT" -gt 1 ]; then
OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1)
OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1)
openstack image delete "$OLD_IMAGE_ID"
else
echo "Only one image exists, skipping deletion."
fi
shell: bash
Loading

0 comments on commit ec81869

Please sign in to comment.