diff --git a/.github/workflows/extra.yml b/.github/workflows/extra.yml new file mode 100644 index 000000000..bf438c336 --- /dev/null +++ b/.github/workflows/extra.yml @@ -0,0 +1,139 @@ +name: Test extra build +on: + workflow_dispatch: + push: + branches: + - main + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - 'ansible/roles/cuda/**' + - 'ansible/roles/lustre/**' + - '.github/workflows/extra.yml' + pull_request: + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - 'ansible/roles/cuda/**' + - 'ansible/roles/lustre/**' + - '.github/workflows/extra.yml' + +jobs: + doca: + name: extra-build + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS + cancel-in-progress: true + runs-on: ubuntu-22.04 + strategy: + fail-fast: false # allow other matrix jobs to continue even if one fails + matrix: # build RL8, RL9 + build: + - image_name: openhpc-extra-RL8 + source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + inventory_groups: doca,cuda,lustre + volume_size: 30 # needed for cuda + - image_name: openhpc-extra-RL9 + source_image_name_key: RL9 + inventory_groups: doca,cuda,lustre + volume_size: 30 # needed for cuda + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} + + steps: + - uses: actions/checkout@v2 + + - name: Load current fat images into GITHUB_ENV + # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string + run: | + { + echo 'FAT_IMAGES<> "$GITHUB_ENV" + + - name: Record settings + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo FAT_IMAGES: ${FAT_IMAGES} + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + shell: bash + + - name: Add bastion's ssh key to known_hosts + run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts + shell: bash + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Setup environment + run: | + . venv/bin/activate + . environments/.stackhpc/activate + + - name: Build fat image with packer + id: packer_build + run: | + set -x + . venv/bin/activate + . environments/.stackhpc/activate + cd packer/ + packer init . + + PACKER_LOG=1 packer build \ + -on-error=${{ vars.PACKER_ON_ERROR }} \ + -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ + -var "volume_size=${{ matrix.build.volume_size }}" \ + openstack.pkr.hcl + + - name: Get created image names from manifest + id: manifest + run: | + . venv/bin/activate + IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) + while ! 
openstack image show -f value -c name $IMAGE_ID; do + sleep 5 + done + IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + echo $IMAGE_ID > image-id.txt + echo $IMAGE_NAME > image-name.txt + + - name: Make image usable for further builds + run: | + . venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + + - name: Delete image for automatically-run workflows + run: | + . venv/bin/activate + openstack image delete "${{ steps.manifest.outputs.image-id }}" + if: ${{ github.event_name != 'workflow_dispatch' }} + + - name: Upload manifest artifact + uses: actions/upload-artifact@v4 + with: + name: image-details-${{ matrix.build.image_name }} + path: | + ./image-id.txt + ./image-name.txt + overwrite: true diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index a8d3dbe29..c3b91fefa 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,30 +15,25 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.openhpc + - image_name: openhpc-RL8 + source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2 + inventory_groups: control,compute,login,update + - image_name: openhpc-RL9 + source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2 + inventory_groups: control,compute,login,update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud }} - SOURCE_IMAGES_MAP: | - { - "RL8": { - "openstack.openhpc": "rocky-latest-RL8" - }, - "RL9": { - "openstack.openhpc": "rocky-latest-RL9" - } - } + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} + LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }} steps: - uses: actions/checkout@v2 @@ -84,13 +79,11 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} - name: Get created image names from manifest id: manifest @@ -101,13 +94,20 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" echo $IMAGE_ID > image-id.txt echo $IMAGE_NAME > image-name.txt + - name: Make image usable for further builds + run: | + . 
venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + name: image-details-${{ matrix.build.image_name }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index f76bd51a9..0f7156fad 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -67,11 +67,31 @@ jobs: for cluster_prefix in ${ci_clusters} do echo "Processing cluster: $cluster_prefix" - TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) - if [[ $TAGS =~ "keep" ]]; then - echo "Skipping ${cluster_prefix} - control instance is tagged as keep" + # Get all servers with the matching name for control node + CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json) + SERVER_COUNT=$(echo "$CONTROL_SERVERS" | jq length) + + if [[ $SERVER_COUNT -gt 1 ]]; then + echo "Multiple servers found for control node '${cluster_prefix}-control'. Checking tags for each..." + + for server in $(echo "$CONTROL_SERVERS" | jq -r '.[].ID'); do + # Get tags for each control node + TAGS=$(openstack server show "$server" --column tags --format value) + + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping ${cluster_prefix} (server ${server}) - control instance is tagged as keep" + else + ./dev/delete-cluster.py ${cluster_prefix} --force + fi + done else - ./dev/delete-cluster.py ${cluster_prefix} --force + # If only one server, extract its tags and proceed + TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags') + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping ${cluster_prefix} - control instance is tagged as keep" + else + ./dev/delete-cluster.py ${cluster_prefix} --force + fi fi done shell: bash diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index da3de4ea5..ec920ce8d 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -11,32 +11,31 @@ on: - SMS - ARCUS schedule: - - cron: '0 0 * * *' # Run at midnight + - cron: '0 0 * * *' # Run at midnight on default branch jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.rocky-latest + - image_name: rocky-latest-RL8 + source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2 + inventory_groups: update + - image_name: rocky-latest-RL9 + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} - SOURCE_IMAGES_MAP: | - { - "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", - "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" - } + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} + LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }} steps: - uses: actions/checkout@v2 @@ -82,15 +81,13 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ 
vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "image_name_version=" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }} - - name: Get created image names from manifest id: manifest run: | @@ -124,7 +121,7 @@ jobs: name: upload-nightly-targets needs: openstack concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -134,18 +131,15 @@ jobs: - LEAFCLOUD - SMS - ARCUS - os_version: - - RL8 - - RL9 - image: - - rocky-latest + build: + - image_name: rocky-latest-RL8 + - image_name: rocky-latest-RL9 exclude: - target_cloud: LEAFCLOUD env: OS_CLOUD: openstack SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} TARGET_CLOUD: ${{ matrix.target_cloud }} - IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}" steps: - uses: actions/checkout@v2 @@ -160,42 +154,37 @@ jobs: . venv/bin/activate pip install -U pip pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - shell: bash - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml - shell: bash - name: Download source image run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml - openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }} - shell: bash + openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }} - name: Upload to target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - openstack image create "${{ env.IMAGE_NAME }}" \ - --file "${{ env.IMAGE_NAME }}" \ + openstack image create "${{ matrix.build.image_name }}" \ + --file "${{ matrix.build.image_name }}" \ --disk-format qcow2 \ - shell: bash - name: Delete old latest image from target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l) + IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l) if [ "$IMAGE_COUNT" -gt 1 ]; then - OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1) + OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1) openstack image delete "$OLD_IMAGE_ID" else echo "Only one image exists, skipping deletion." fi - shell: bash diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..d5bd313ca 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -99,9 +99,9 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate ansible-playbook ansible/adhoc/generate-passwords.yml - echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml + echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml env: - TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Provision nodes using fat image id: provision_servers @@ -163,40 +163,28 @@ jobs: --spider \ --server-response \ --no-check-certificate \ - --http-user=testuser \ - --http-password=${TESTUSER_PASSWORD} https://${openondemand_servername} \ + --http-user=demo_user \ + --http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \ 2>&1) (echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) env: - TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - - # - name: Build environment-specific compute image - # id: packer_build - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # cd packer/ - # packer init - # PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - # ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs - - # - name: Test reimage of compute nodes to new environment-specific image (via slurm) - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]" - # ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down - # ansible-playbook -v ansible/ci/check_slurm.yml + DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Test reimage of login and control nodes (via rebuild adhoc) run: | . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml + - name: Test reimage of compute nodes and compute-init (via rebuild adhoc) + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml + ansible-playbook -v ansible/ci/check_slurm.yml + - name: Check sacct state survived reimage run: | . 
venv/bin/activate diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml index 4c090b85a..5b65baca1 100644 --- a/.github/workflows/trivyscan.yml +++ b/.github/workflows/trivyscan.yml @@ -25,6 +25,20 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Override CI_CLOUD if PR label is present + if: ${{ github.event_name == 'pull_request' }} + run: | + # Iterate over the labels + labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') + echo $labels + for label in $labels; do + if [[ $label == CI_CLOUD=* ]]; then + # Extract the value after 'CI_CLOUD=' + CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} + echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV + fi + done + - name: Record settings for CI cloud run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} diff --git a/README.md b/README.md index f61bf8df4..54b74d799 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,10 @@ This repository contains playbooks and configuration to define a Slurm-based HPC - [Rocky Linux](https://rockylinux.org/)-based hosts. - [OpenTofu](https://opentofu.org/) configurations to define the cluster's infrastructure-as-code. - Packages for Slurm and MPI software stacks from [OpenHPC](https://openhpc.community/). -- Shared fileystem(s) using NFS (with in-cluster or external servers) or [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [Openstack Manila](https://wiki.openstack.org/wiki/Manila). +- Shared fileystem(s) using NFS (with in-cluster or external servers) or [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [OpenStack Manila](https://wiki.openstack.org/wiki/Manila). - Slurm accounting using a MySQL database. - Monitoring integrated with Slurm jobs using Prometheus, ElasticSearch and Grafana. -- A web-based portal from [OpenOndemand](https://openondemand.org/). +- A web-based portal from [Open OnDemand](https://openondemand.org/). - Production-ready default Slurm configurations for access and memory limits. - [Packer](https://developer.hashicorp.com/packer)-based image build configurations for node images. @@ -25,17 +25,18 @@ The default configuration in this repository may be used to create a cluster to - Persistent state backed by an OpenStack volume. - NFS-based shared file system backed by another OpenStack volume. -Note that the OpenOndemand portal and its remote apps are not usable with this default configuration. +Note that the Open OnDemand portal and its remote apps are not usable with this default configuration. It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud. Before starting ensure that: - You have root access on the deploy host. - You can create instances using a Rocky 9 GenericCloud image (or an image based on that). - - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI. However the appliance will install the necessary packages if a GenericCloud image is used. -- You have a SSH keypair defined in OpenStack, with the private key available on the deploy host. + - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI. +- You have an SSH keypair defined in OpenStack, with the private key available on the deploy host. 
- Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). +- Three security groups are present: ``default`` allowing intra-cluster communication, ``SSH`` allowing external access via SSH and ``HTTPS`` allowing access for Open OnDemand. ### Setup deploy host @@ -66,10 +67,11 @@ Use the `cookiecutter` template to create a new environment to hold your configu and follow the prompts to complete the environment name and description. -**NB:** In subsequent sections this new environment is refered to as `$ENV`. +**NB:** In subsequent sections this new environment is referred to as `$ENV`. -Activate the new environment: +Go back to the root folder and activate the new environment: + cd .. . environments/$ENV/activate And generate secrets for it: @@ -104,6 +106,7 @@ To deploy this infrastructure, ensure the venv and the environment are [activate export OS_CLOUD=openstack cd environments/$ENV/terraform/ + tofu init tofu apply and follow the prompts. Note the OS_CLOUD environment variable assumes that OpenStack credentials are defined using a [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) file in a default location with the default cloud name of `openstack`. @@ -123,8 +126,8 @@ where the IP of the login node is given in `environments/$ENV/inventory/hosts.ym ## Overview of directory structure - `environments/`: See [docs/environments.md](docs/environments.md). -- `ansible/`: Contains the ansible playbooks to configure the infrastruture. -- `packer/`: Contains automation to use Packer to build machine images for an enviromment - see the README in this directory for further information. +- `ansible/`: Contains the ansible playbooks to configure the infrastructure. +- `packer/`: Contains automation to use Packer to build machine images for an environment - see the README in this directory for further information. - `dev/`: Contains development tools. For further information see the [docs](docs/) directory. 
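As a point of reference for the `OS_CLOUD=openstack` assumption above, a minimal `~/.config/openstack/clouds.yaml` might look like the sketch below. The endpoint, region and credential values are placeholders, and application credentials are only one possible auth type; the one real requirement is that the cloud entry name matches whatever `OS_CLOUD` is set to.

```yaml
# Minimal sketch only - endpoint, region and credentials are placeholders
clouds:
  openstack:                                     # must match OS_CLOUD
    auth:
      auth_url: https://keystone.example.org:5000/v3   # your cloud's identity endpoint
      application_credential_id: "REPLACE_ME"
      application_credential_secret: "REPLACE_ME"
    auth_type: v3applicationcredential
    region_name: RegionOne
    interface: public
    identity_api_version: 3
```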
diff --git a/ansible/.gitignore b/ansible/.gitignore index 8edcc4360..1cabb8ad8 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -58,9 +58,21 @@ roles/* !roles/squid/** !roles/tuned/ !roles/tuned/** +!roles/sssd/ +!roles/sssd/** +!roles/sshd/ +!roles/sshd/** +!roles/compute_init/ +!roles/compute_init/** !roles/k3s/ !roles/k3s/** !roles/k9s/ !roles/k9s/** !roles/lustre/ !roles/lustre/** +!roles/dnf_repos/ +!roles/dnf_repos/** +!roles/pulp_site/ +!roles/pulp_site/** +!roles/doca/ +!roles/doca/** diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml new file mode 100644 index 000000000..2858d032b --- /dev/null +++ b/ansible/adhoc/deploy-pulp.yml @@ -0,0 +1,26 @@ +# Usage: ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server=" + +- name: Add temporary pulp server host + hosts: localhost + tasks: + - ansible.builtin.add_host: + name: "{{ pulp_server }}" + group: "_pulp_host" + +- name: Install pulp on server and add to config + become: yes + hosts: _pulp_host + tasks: + - name: Install pulp + ansible.builtin.include_role: + name: pulp_site + tasks_from: install.yml + public: true + + - name: Print Pulp endpoint + become: no + debug: + msg: | + Server configured, override 'appliances_pulp_url' with + appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" + in your environments diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml new file mode 100644 index 000000000..b2cd9a8c4 --- /dev/null +++ b/ansible/adhoc/sync-pulp.yml @@ -0,0 +1,10 @@ +- hosts: localhost + tasks: + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + vars: + pulp_site_target_arch: "x86_64" + pulp_site_target_distribution: "rocky" + pulp_site_target_distribution_version: "9.5" + pulp_site_target_distribution_version_major: "9" diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 733d4b3f8..88d9274b3 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -110,6 +110,28 @@ policy: "{{ selinux_policy }}" register: sestatus +- hosts: sshd + tags: sshd + gather_facts: no + become: yes + tasks: + - name: Configure sshd + import_role: + name: sshd + +- hosts: dnf_repos + become: yes + tasks: + - name: Check that creds won't be leaked to users + ansible.builtin.assert: + that: dnf_repos_password is undefined + fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password' + when: appliances_mode == 'configure' + - name: Replace system repos with pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml + # --- tasks after here require access to package repos --- - hosts: squid tags: squid diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index d95c5bb5c..6507caf08 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -6,9 +6,9 @@ shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - until: "'boot' not in sinfo.stdout_lines" - retries: 5 - delay: 10 + until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout) + retries: 10 + delay: 5 - name: Check nodes have expected slurm state assert: that: sinfo.stdout_lines == expected_sinfo diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index cf9b0bdab..670a99b29 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -61,5 +61,9 @@ os: "{{ ansible_distribution }} {{ 
ansible_distribution_version }}" kernel: "{{ ansible_kernel }}" ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}" + doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}" cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" + +- name: Show image summary + command: cat /var/lib/image/image.json diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml new file mode 100644 index 000000000..3e8022965 --- /dev/null +++ b/ansible/disable-repos.yml @@ -0,0 +1,7 @@ +- hosts: dnf_repos + become: yes + tasks: + - name: Disable pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: disable_repos.yml diff --git a/ansible/extras.yml b/ansible/extras.yml index 107f85252..72c76b3b1 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -24,8 +24,9 @@ gather_facts: yes tags: cuda tasks: - - import_role: + - include_role: name: cuda + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}" - name: Persist hostkeys across rebuilds # Must be after filesystems.yml (for storage) @@ -37,6 +38,17 @@ - import_role: name: persist_hostkeys + +- name: Setup NFS export for compute node configuration + hosts: compute_init:!builder + # NB: has to be after eeesi and os-manila-mount + tags: compute_init + become: yes + tasks: + - include_role: + name: compute_init + tasks_from: export.yml + - name: Install k9s become: yes hosts: k9s @@ -44,3 +56,12 @@ tasks: - import_role: name: k9s + +- hosts: extra_packages + become: yes + tags: + - extra_packages + tasks: + - name: Install additional packages + dnf: + name: "{{ appliances_extra_packages }}" diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 7cad2dc59..e5de38edf 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -6,6 +6,9 @@ tasks: - name: Report hostname (= final image name) command: hostname + - name: Report inventory groups + debug: + var: group_names - name: Run pre.yml hook vars: @@ -14,8 +17,26 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- name: Sync pulp repos with upstream + hosts: pulp + tasks: + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + apply: + delegate_to: localhost + when: appliances_mode != 'configure' + - import_playbook: bootstrap.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - name: Run post-bootstrap.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -33,6 +54,11 @@ name: freeipa tasks_from: client-install.yml when: "'freeipa_client' in group_names" + - name: Install sssd + import_role: + name: sssd + tasks_from: install.yml + when: "'sssd' in group_names" # - import_playbook: filesystems.yml: - name: Install nfs packages @@ -52,6 +78,16 @@ - import_playbook: extras.yml +# TODO: is this the right place? 
+- name: Install compute_init script + hosts: compute_init + tags: compute_init # tagged to allow running on cluster instances for dev + become: yes + tasks: + - include_role: + name: compute_init + tasks_from: install.yml + - hosts: builder become: yes gather_facts: yes @@ -98,6 +134,10 @@ tasks_from: jupyter_compute.yml when: "'openondemand_jupyter' in group_names" + - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build + yum: + name: mod_authnz_pam + # - import_playbook: monitoring.yml: - import_role: name: opensearch @@ -202,6 +242,8 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: disable-repos.yml + - hosts: builder become: yes gather_facts: yes diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index f69d6f3f7..508f794cc 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -41,6 +41,9 @@ def to_ood_regex(items): eg {{ [compute-001, compute-002, control] | to_regex }} -> '(compute-\d+)|(control)' """ + # NB: for python3.12+ the \d in this function & docstring + # need to be raw strings. See https://docs.python.org/3/reference/lexical_analysis.html + # There's a python bug which means re.sub() can't use '\d' in the replacement so # have to do replacement in two stages: r = [re.sub(r"\d+", 'XBACKSLASHX', v) for v in items] @@ -48,6 +51,11 @@ def to_ood_regex(items): r = ['(%s)' % v for v in r] return '|'.join(r) +def appliances_repo_to_subpath(repo_entry): + """ Take an element from appliances_pulp_repos and convert it to a pulp path. This assumes that the remote and local pulp structures are the same + """ + return repo_entry['path'] + '/' + repo_entry['timestamp'] + class FilterModule(object): ''' Ansible core jinja2 filters ''' @@ -63,4 +71,5 @@ def filters(self): 'exists': exists, 'warn': self.warn, 'to_ood_regex': to_ood_regex, + 'appliances_repo_to_subpath': appliances_repo_to_subpath } diff --git a/ansible/iam.yml b/ansible/iam.yml index 0286b9df3..857b8f840 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -40,3 +40,12 @@ import_role: name: freeipa tasks_from: users.yml + +- hosts: sssd + become: yes + gather_facts: no + tags: sssd + tasks: + - name: Configure sssd + import_role: + name: sssd diff --git a/ansible/roles/basic_users/README.md b/ansible/roles/basic_users/README.md index 4b75100ca..65fdd2c4c 100644 --- a/ansible/roles/basic_users/README.md +++ b/ansible/roles/basic_users/README.md @@ -24,6 +24,7 @@ Role Variables - An additional key `sudo` may optionally be specified giving a string (possibly multiline) defining sudo rules to be templated. - Any other keys may present for other purposes (i.e. not used by this role). - `basic_users_groups`: Optional, default empty list. A list of mappings defining information for each group. Mapping keys/values are passed through as parameters to [ansible.builtin.group](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/group_module.html) and default values are as given there. +- `basic_users_override_sssd`: Optional bool, default false. Whether to disable `sssd` when ensuring users/groups exist with this role. Permits creating local users/groups even if they clash with users provided via sssd (e.g. from LDAP). Ignored if host is not in group `sssd` as well. Note with this option active `sssd` will be stopped and restarted each time this role is run. 
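To illustrate how the variables documented above combine, a hypothetical group vars snippet for an environment is sketched below; the user, group, UID/GID values and the sudo rule are invented, and any keys not handled by the role itself are passed straight through to `ansible.builtin.user` / `ansible.builtin.group`.

```yaml
# Hypothetical example only - names, IDs and the sudo rule are placeholders
basic_users_groups:
  - name: demo_group              # parameters passed to ansible.builtin.group
    gid: 2001
basic_users_users:
  - name: demo_user               # merged over basic_users_userdefaults, passed to ansible.builtin.user
    uid: 3001
    groups: [demo_group]
    sudo: "demo_user ALL=(ALL) NOPASSWD:ALL"   # templated into a sudoers drop-in
  - name: old_user
    state: absent                 # remove a previously-created user
basic_users_override_sssd: true   # only takes effect on hosts that are also in the sssd group
```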
Dependencies ------------ diff --git a/ansible/roles/basic_users/defaults/main.yml b/ansible/roles/basic_users/defaults/main.yml index 9f34bdf4c..e6c6eafaa 100644 --- a/ansible/roles/basic_users/defaults/main.yml +++ b/ansible/roles/basic_users/defaults/main.yml @@ -7,3 +7,4 @@ basic_users_userdefaults: shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}" basic_users_users: [] basic_users_groups: [] +basic_users_override_sssd: false diff --git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml index c27d024b4..c6733fb89 100644 --- a/ansible/roles/basic_users/tasks/main.yml +++ b/ansible/roles/basic_users/tasks/main.yml @@ -7,7 +7,16 @@ label: "{{ item.name }}" when: - "item.state | default('present') == 'absent'" - + +- name: Stop sssd if required + systemd: + name: sssd + state: stopped + register: _stop_sssd + when: + - "'sssd' in group_names" + - basic_users_override_sssd | bool + - name: Create groups ansible.builtin.group: "{{ item }}" loop: "{{ basic_users_groups }}" @@ -19,6 +28,12 @@ label: "{{ item.name }} [{{ item.state | default('present') }}]" register: basic_users_info +- name: Restart sssd if required + systemd: + name: sssd + state: started + when: _stop_sssd is changed + - name: Write supplied public key as authorized for SSH access authorized_key: user: "{{ item.name }}" diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 453f01a7e..69d001105 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } @@ -565,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md new file mode 100644 index 000000000..db18034aa --- /dev/null +++ b/ansible/roles/compute_init/README.md @@ -0,0 +1,133 @@ +# EXPERIMENTAL: compute-init + +Experimental / in-progress functionality to allow compute nodes to rejoin the +cluster after a reboot. + +To enable this add compute nodes (or a subset of them into) the `compute_init` +group. + +This works as follows: +1. During image build, an ansible-init playbook and supporting files +(e.g. templates, filters, etc) are installed. +2. Cluster instances are created as usual; the above compute-init playbook does +not run. +3. The `site.yml` playbook is run as usual to configure all the instances into +a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS +share is created on the control node containing: + - an /etc/hosts file for the cluster + - Hostvars for each compute node +4. On reboot of a compute node, ansible-init runs the compute-init playbook +which: + a. Checks whether the `enable_compute` metadata flag is set, and exits if + not. + b. 
Tries to mount the above `/exports/cluster` NFS share from the control + node, and exits if it cannot. + c. Configures itself using the exported hostvars, depending on the + `enable_*` flags set in metadata. + d. Issues an `scontrol` command to resume the node (because Slurm will + consider it as "unexpectedly rebooted"). + +The check in 4b. above is what prevents the compute-init script from trying +to configure the node before the services on the control node are available +(which requires running the site.yml playbook). + +The following roles/groups are currently fully functional: +- `resolv_conf`: all functionality +- `etc_hosts`: all functionality +- `nfs`: client functionality only +- `manila`: all functionality +- `basic_users`: all functionality, assumes home directory already exists on + shared storage +- `eessi`: all functionality, assumes `cvmfs_config` is the same on control + node and all compute nodes. +- `openhpc`: all functionality + +The above may be enabled by setting the compute_init_enable property on the +terraform compute variable. + +# Development/debugging + +To develop/debug changes to the compute script without actually having to build +a new image: + +1. Deploy a cluster using tofu and ansible/site.yml as normal. This will + additionally configure the control node to export compute hostvars over NFS. + Check the cluster is up. + +2. Reimage the compute nodes: + + ansible-playbook --limit compute ansible/adhoc/rebuild.yml + +3. Add metadata to a compute node e.g. via Horizon to turn on compute-init + playbook functionality. + +4. Fake an image build to deploy the compute-init playbook: + + ansible-playbook ansible/fatimage.yml --tags compute_init + + NB: This will also re-export the compute hostvars, as the nodes are not + in the builder group, which conveniently means any changes made to that + play also get picked up. + +5. Fake a reimage of compute to run ansible-init and the compute-init playbook: + + On compute node where metadata was added: + + [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init + [root@rl9-compute-0 rocky]# systemctl status ansible-init + + Use `systemctl status ansible-init` to view stdout/stderr from Ansible. + +Steps 4/5 can be repeated with changes to the compute script. If required, +reimage the compute node(s) first as in step 2 and/or add additional metadata +as in step 3. + + +# Design notes +- Duplicating code in roles into the `compute-init` script is unfortunate, but + does allow developing this functionality without wider changes to the + appliance. + +- In general, we don't want to rely on NFS export. So should e.g. copy files + from this mount ASAP in the compute-init script. TODO: + +- There are a couple of approaches to supporting existing roles using `compute-init`: + + 1. Control node copies files resulting from role into cluster exports, + compute-init copies to local disk. Only works if files are not host-specific + Examples: etc_hosts, eessi config? + + 2. Re-implement the role. Works if the role vars are not too complicated, + (else they all need to be duplicated in compute-init). 
Could also only + support certain subsets of role functionality or variables + Examples: resolv_conf, stackhpc.openhpc + +- Some variables are defined using hostvars from other nodes, which aren't + available with the current approach: + + ``` + [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", + "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", + "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", + "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", + "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", + "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", + "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", + "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", + "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", + "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" + ``` + + More generally, there is nothing to stop any group var depending on a + "{{ hostvars[] }}" interpolation ... + + Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern + for compute nodes - both of these indirect via `api_address` to + `inventory_hostname`. This has been worked around by replacing this with + "{{ groups['control'] | first }}" which does result in the control node + inventory hostname when templating. + + Note that although `groups` is defined in the templated hostvars, when + the hostvars are loaded using `include_vars:` it is ignored as it is a + "magic variable" determined by ansible itself and cannot be set.
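For orientation before reading the playbook below: everything it does is driven by OpenStack instance metadata. Purely as an illustration (the keys correspond to the `os_metadata.meta.*` lookups in the playbook; the address is an example value), a compute node whose `meta_data.json` contained the following `meta` section would have the etc_hosts, NFS-client and basic_users steps applied on boot. In normal operation this metadata comes from the `compute_init_enable` property on the Terraform compute variable, as noted above; for development it can be added by hand, e.g. via Horizon.

```yaml
# Illustrative metadata only ("meta" section of meta_data.json); values are examples
meta:
  control_address: "10.10.0.5"   # control node IP, used to mount /exports/cluster over NFS
  compute: "true"                # master switch: the playbook ends immediately if this is absent or false
  etc_hosts: "true"
  nfs: "true"
  basic_users: "true"
  # resolv_conf, manila and eessi default to false when their keys are absent
```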
diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml new file mode 100644 index 000000000..430e2cf65 --- /dev/null +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -0,0 +1,285 @@ +--- + +- name: Compute node initialisation + hosts: localhost + become: yes + vars: + os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + server_node_ip: "{{ os_metadata.meta.control_address }}" + enable_compute: "{{ os_metadata.meta.compute | default(false) | bool }}" + enable_resolv_conf: "{{ os_metadata.meta.resolv_conf | default(false) | bool }}" + enable_etc_hosts: "{{ os_metadata.meta.etc_hosts | default(false) | bool }}" + enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}" + enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}" + enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}" + enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}" + + # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects + resolv_conf_nameservers: [] + + nfs_client_mnt_point: "/mnt" + nfs_client_mnt_options: + nfs_client_mnt_state: mounted + nfs_configurations: + nfs_enable: + clients: false + + # openhpc: no defaults required + + os_manila_mount_shares: [] + os_manila_mount_ceph_conf_path: /etc/ceph + os_manila_mount_state: mounted + os_manila_mount_opts: + - x-systemd.device-timeout=30 + - x-systemd.mount-timeout=30 + - noatime + - _netdev # prevents mount blocking early boot before networking available + - rw + + basic_users_groups: [] + basic_users_manage_homedir: false # homedir must already exist on shared filesystem + basic_users_userdefaults: + state: present + create_home: "{{ basic_users_manage_homedir }}" + generate_ssh_key: "{{ basic_users_manage_homedir }}" + ssh_key_comment: "{{ item.name }}" + basic_users_users: [] + + tasks: + - block: + - name: Report skipping initialization if not compute node + # meta: end_play produces no output + debug: + msg: "Skipping compute initialization: Metadata enable_compute is not true" + + - meta: end_play + when: not enable_compute + + - name: Ensure the mount directory exists + file: + path: /mnt/cluster + state: directory + owner: root + group: root + mode: u=rwX,go= # is sensitive + + - name: Mount /mnt/cluster + mount: + path: /mnt/cluster + src: "{{ server_node_ip }}:/exports/cluster" + fstype: nfs + opts: ro,sync + state: mounted + register: _mount_mnt_cluster + ignore_errors: true + # TODO: add some retries here? + + - block: + - name: Report skipping initialization if cannot mount nfs + # meta: end_play produces no output + debug: + msg: "Skipping compute initialization: Failed to mount /exports/cluster from control node {{ server_node_ip }}" + + - meta: end_play + when: _mount_mnt_cluster.failed + + - name: Load hostvars from NFS + # this is higher priority than vars block = normal ansible's hostvars + include_vars: + file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname + + # TODO: should /mnt/cluster now be UNMOUNTED to avoid future hang-ups? 
+ + - name: Configure resolve.conf + block: + - name: Set nameservers in /etc/resolv.conf + ansible.builtin.template: + src: resolv.conf.j2 + dest: /etc/resolv.conf + owner: root + group: root + mode: u=rw,og=r + + - name: Disable NetworkManager control of resolv.conf + ansible.builtin.copy: + src: files/NetworkManager-dns-none.conf + dest: /etc/NetworkManager/conf.d/90-dns-none.conf + owner: root + group: root + mode: u=rw,og=r + register: _copy_nm_config + + - name: Reload NetworkManager + ansible.builtin.systemd: + name: NetworkManager + state: reloaded + when: _copy_nm_config.changed | default(false) + when: enable_resolv_conf + + - name: Copy cluster /etc/hosts + copy: + src: /mnt/cluster/hosts + dest: /etc/hosts + owner: root + group: root + mode: 0644 + when: enable_etc_hosts + + # NFS client mount + - name: If nfs-clients is present + include_tasks: tasks/nfs-clients.yml + when: + - enable_nfs + - nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) + loop: "{{ nfs_configurations }}" + + - name: Manila mounts + block: + - name: Read manila share info from nfs file + include_vars: + file: /mnt/cluster/manila_share_info.yml + no_log: true # contains secrets + + - name: Ensure Ceph configuration directory exists + ansible.builtin.file: + path: "{{ os_manila_mount_ceph_conf_path }}" + state: directory + mode: "0755" + owner: root + group: root + + - name: Configure ceph.conf using os_manila_mount_host + ansible.builtin.template: + src: ceph.conf.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf" + owner: root + group: root + mode: "0600" + + - name: Ensure mount directory exists + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Write Ceph client keyring + ansible.builtin.template: + src: ceph.keyring.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring" + mode: "0600" + owner: root + group: root + loop: "{{ os_manila_mount_share_info }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Mount the Ceph share + ansible.posix.mount: + path: "{{ item[0].mount_path }}" + src: "{{ item[1].host }}:{{ item[1].export }}" + fstype: ceph + opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}" + # NB share_user is looked up here in case of autodetection + state: "{{ item[0].mount_state | default(os_manila_mount_state) }}" + loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}" + loop_control: + label: "{{ item[0].share_name }}" + + - name: Ensure mounted directory has correct permissions + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] + when: + - enable_manila + - os_manila_mount_shares | length > 0 + + - name: Basic users + block: + - name: Create groups + ansible.builtin.group: "{{ item }}" + loop: "{{ basic_users_groups }}" + + - name: Create users + user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() 
}}" + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }} [{{ item.state | default('present') }}]" + register: basic_users_info + + - name: Write sudo rules + blockinfile: + path: /etc/sudoers.d/80-{{ item.name}}-user + block: "{{ item.sudo }}" + create: true + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }}" + when: "'sudo' in item" + when: enable_basic_users + + - name: EESSI + block: + - name: Copy cvmfs config + copy: + src: /mnt/cluster/cvmfs/default.local + dest: /etc/cvmfs/default.local + owner: root + group: root + mode: 0644 + + - name: Ensure CVMFS config is setup + command: + cmd: "cvmfs_config setup" + when: enable_eessi + + # NB: don't need conditional block on enable_compute as have already exited + # if not the case + - name: Write Munge key + copy: + content: "{{ openhpc_munge_key }}" + dest: "/etc/munge/munge.key" + owner: munge + group: munge + mode: 0400 + + - name: Set slurmctld location for configless operation + lineinfile: + path: /etc/sysconfig/slurmd + line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" + regexp: "^SLURMD_OPTIONS=" + create: yes + owner: root + group: root + mode: 0644 + + - name: Ensure Munge service state + service: + name: munge + enabled: true + state: started + + - name: Ensure slurmd service state + service: + name: slurmd + enabled: true + state: started + + - name: Ensure node is resumed + # TODO: consider if this is always safe for all job states? + command: scontrol update state=resume nodename={{ ansible_hostname }} + register: _scontrol_update + failed_when: + - _scontrol_update.rc > 0 + - "'slurm_update error: Invalid node state specified' not in _scontrol_update.stderr" diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml new file mode 100644 index 000000000..12b648f6e --- /dev/null +++ b/ansible/roles/compute_init/tasks/export.yml @@ -0,0 +1,67 @@ +- name: Ensure the /exports/cluster directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: u=rwX,go= + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy /etc/hosts to /exports/cluster + copy: + src: /etc/hosts + dest: /exports/cluster/hosts + owner: root + group: root + mode: u=rw,go= + remote_src: true + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Create hostvars directory + file: + path: /exports/cluster/hostvars/{{ inventory_hostname }}/ + state: directory + mode: u=rwX,go= + # TODO: owner,mode,etc + delegate_to: "{{ groups['control'] | first }}" + +- name: Template out hostvars + template: + src: hostvars.yml.j2 + dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml + mode: u=rw,go= + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}" + dest: /exports/cluster/manila_share_info.yml + run_once: true + delegate_to: "{{ groups['control'] | first }}" + when: os_manila_mount_share_info is defined + vars: + os_manila_mount_share_info_var: + os_manila_mount_share_info: "{{ os_manila_mount_share_info }}" + +- name: Ensure /exports/cluster/cvmfs directory exists + file: + path: /exports/cluster/cvmfs + state: directory + owner: root + group: root + mode: 0755 + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy EESSI CVMFS config to /exports/cluster + copy: + 
src: /etc/cvmfs/default.local + dest: /exports/cluster/cvmfs/default.local + owner: root + group: root + mode: 0644 + remote_src: true + run_once: true + delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml new file mode 100644 index 000000000..bbcbf133f --- /dev/null +++ b/ansible/roles/compute_init/tasks/install.yml @@ -0,0 +1,53 @@ +--- + +- name: Ensure directories exist + file: + path: "/etc/ansible-init/playbooks/{{ item }}" + state: directory + owner: root + group: root + mode: 0755 + loop: + - templates + - files + - library + - filter_plugins + - tasks + +- name: Inject files from roles + copy: + src: '{{ item.src }}' + dest: '/etc/ansible-init/playbooks/{{ item.dest }}' + owner: root + group: root + mode: 0644 + loop: + - src: ../../resolv_conf/templates/resolv.conf.j2 + dest: templates/resolv.conf.j2 + - src: ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 + dest: templates/ceph.conf.j2 + - src: ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 + dest: templates/ceph.keyring.j2 + - src: ../../resolv_conf/files/NetworkManager-dns-none.conf + dest: files/NetworkManager-dns-none.conf + - src: ../../basic_users/filter_plugins/filter_keys.py + dest: filter_plugins/filter_keys.py + - src: ../../stackhpc.nfs/tasks/nfs-clients.yml + dest: tasks/nfs-clients.yml + +- name: Add filter_plugins to ansible.cfg + lineinfile: + path: /etc/ansible-init/ansible.cfg + line: "filter_plugins = /etc/ansible-init/filter_plugins" + state: present + owner: root + group: root + mode: 0644 + +- name: Add compute initialisation playbook + copy: + src: compute-init.yml + dest: /etc/ansible-init/playbooks/1-compute-init.yml + owner: root + group: root + mode: 0644 diff --git a/ansible/roles/compute_init/templates/hostvars.yml.j2 b/ansible/roles/compute_init/templates/hostvars.yml.j2 new file mode 100644 index 000000000..7d4351b44 --- /dev/null +++ b/ansible/roles/compute_init/templates/hostvars.yml.j2 @@ -0,0 +1 @@ +{{ hostvars[inventory_hostname] | to_nice_json }} \ No newline at end of file diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md index 141e7b80d..be6439cd5 100644 --- a/ansible/roles/cuda/README.md +++ b/ansible/roles/cuda/README.md @@ -1,6 +1,6 @@ # cuda -Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. +Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. ## Prerequisites @@ -8,8 +8,8 @@ Requires OFED to be installed to provide required kernel-* packages. ## Role Variables -- `cuda_distro`: Optional. Default `rhel8`. -- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo` -- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. 
If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed. +- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. +- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version. - `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`. +- `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 33a25d9b4..05f1e093d 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,7 +1,6 @@ -cuda_distro: "rhel{{ ansible_distribution_major_version }}" -cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo" -cuda_driver_stream: default -cuda_package_version: 'latest' +cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" +cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages +cuda_package_version: '12.6.3-1' cuda_packages: - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/install.yml similarity index 60% rename from ansible/roles/cuda/tasks/main.yml rename to ansible/roles/cuda/tasks/install.yml index 22f8e9e8e..51c92a0d3 100644 --- a/ansible/roles/cuda/tasks/main.yml +++ b/ansible/roles/cuda/tasks/install.yml @@ -1,7 +1,7 @@ # Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation -- name: Check for OFED +- name: Check for OFED/DOCA command: cmd: dnf list --installed rdma-core register: _dnf_rdma_core @@ -10,41 +10,53 @@ - name: Assert OFED installed assert: that: "'mlnx' in _dnf_rdma_core.stdout" - fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?" + fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?" 
- name: Install cuda repo get_url: - dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo" - url: "{{ cuda_repo }}" + dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo" + url: "{{ cuda_repo_url }}" - name: Check if nvidia driver module is enabled - shell: - cmd: dnf module list --enabled nvidia-driver + ansible.builtin.command: dnf module list --enabled nvidia-driver changed_when: false failed_when: false register: _cuda_driver_module_enabled - name: Enable nvidia driver module - ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" + ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}" register: _cuda_driver_module_enable when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" +- name: Check if nvidia driver module is installed + ansible.builtin.command: dnf module list --installed nvidia-driver + changed_when: false + failed_when: false + register: _cuda_driver_module_installed + - name: Install nvidia drivers ansible.builtin.command: dnf module install -y nvidia-driver register: _cuda_driver_install - when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" + when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr" changed_when: "'Nothing to do' not in _cuda_driver_install.stdout" +- name: Check kernel has not been modified + assert: + that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched + fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" + - name: Install cuda packages ansible.builtin.dnf: name: "{{ cuda_packages }}" + when: cuda_package_version != 'none' register: cuda_package_install - name: Add cuda binaries to path lineinfile: path: /etc/profile.d/sh.local line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin' + when: cuda_package_version != 'none' - name: Enable NVIDIA Persistence Daemon systemd: @@ -60,3 +72,4 @@ - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 + when: cuda_package_install.changed diff --git a/ansible/roles/cuda/tasks/runtime.yml b/ansible/roles/cuda/tasks/runtime.yml new file mode 100644 index 000000000..c16a48c6f --- /dev/null +++ b/ansible/roles/cuda/tasks/runtime.yml @@ -0,0 +1,5 @@ +- name: Ensure NVIDIA Persistence Daemon state + systemd: + name: nvidia-persistenced + enabled: true + state: "{{ cuda_persistenced_state }}" diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml new file mode 100644 index 000000000..6d41046ec --- /dev/null +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -0,0 +1,48 @@ +dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content" +dnf_repos_username: "{{ omit }}" +dnf_repos_password: "{{ omit }}" + +dnf_repos_filenames: + '8': + baseos: 'Rocky-BaseOS' + appstream: 'Rocky-AppStream' + crb: 'Rocky-PowerTools' + extras: 'Rocky-Extras' + '9': + baseos: 'rocky' + appstream: 'rocky' + crb: 'rocky' + extras: 'rocky-extras' + +dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}" + +# epel installed separately +dnf_repos_default_repolist: +- file: "{{ dnf_repos_version_filenames.baseos }}" + name: baseos + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" +- file: "{{ 
dnf_repos_version_filenames.appstream }}" + name: appstream + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" +- file: "{{ dnf_repos_version_filenames.crb }}" + name: "{{ 'powertools' if ansible_distribution_major_version == '8' else 'crb' }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" +- file: "{{ dnf_repos_version_filenames.extras }}" + name: extras + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" +- file: ceph + name: Ceph + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}" + +dnf_repos_openhpc_repolist: +- name: OpenHPC + file: OpenHPC + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_base[ansible_distribution_major_version] | appliances_repo_to_subpath }}" +- name: OpenHPC-updates + file: OpenHPC + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_updates[ansible_distribution_major_version] | appliances_repo_to_subpath }}" + +dnf_repos_repolist: "{{ dnf_repos_default_repolist + (dnf_repos_openhpc_repolist if (openhpc_install_type | default('ohpc')) == 'ohpc' else []) }}" + +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" +dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml new file mode 100644 index 000000000..2dbacc262 --- /dev/null +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -0,0 +1,18 @@ +--- +- name: Disable Pulp repos + ansible.builtin.yum_repository: + file: "{{ item.file }}" + name: "{{ item.name }}" + baseurl: "{{ item.base_url }}" + description: "{{ item.name }}" + enabled: false + loop: "{{ dnf_repos_repolist }}" + +- name: Disable EPEL repo + ansible.builtin.yum_repository: + name: epel + file: epel + description: "{{ dnf_repos_epel_description }}" + baseurl: "{{ dnf_repos_epel_baseurl }}" + gpgcheck: false + enabled: false diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml new file mode 100644 index 000000000..c9fcb0c07 --- /dev/null +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -0,0 +1,27 @@ +--- + +- name: Replace system repos with Pulp repos + ansible.builtin.yum_repository: + file: "{{ item.file }}" + name: "{{ item.name }}" + baseurl: "{{ item.base_url }}" + description: "{{ item.name }}" + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" + gpgcheck: false + loop: "{{ dnf_repos_repolist }}" + +- name: Install epel-release + # done so that roles installing epel via epel-release don't over-write our changes to the epel repo + ansible.builtin.dnf: + name: epel-release + +- name: Use Pulp EPEL repo + ansible.builtin.yum_repository: + name: epel + file: epel + description: "{{ dnf_repos_epel_description }}" + gpgcheck: false + baseurl: "{{ dnf_repos_epel_baseurl }}" + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" diff --git a/ansible/roles/doca/README.md b/ansible/roles/doca/README.md new file mode 100644 index 000000000..5f898add5 --- /dev/null +++ b/ansible/roles/doca/README.md @@ -0,0 +1,12 @@ +# doca + +Install [NVIDIA 
DOCA](https://docs.nvidia.com/doca/sdk/index.html). + +This role is not idempotent and is only intended to be run during an image build. It builds DOCA kernel modules to match the installed kernel and then installs these +plus the selected DOCA packages. + +## Role Variables + +- `doca_version`: Optional. String giving doca version. +- `doca_profile`: Optional. Name of [profile](https://docs.nvidia.com/doca/sdk/nvidia+doca+profiles/index.html) defining subset of DOCA to install. Default is `doca-ofed`. +- `doca_repo_url`: Optional. URL of DOCA repository. Default is appropriate upstream public repository for DOCA version, distro version and architecture. diff --git a/ansible/roles/doca/defaults/main.yml b/ansible/roles/doca/defaults/main.yml new file mode 100644 index 000000000..66437cd04 --- /dev/null +++ b/ansible/roles/doca/defaults/main.yml @@ -0,0 +1,3 @@ +doca_version: '2.9.1' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates +doca_profile: doca-ofed +doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" diff --git a/ansible/roles/doca/tasks/install-kernel-devel.yml b/ansible/roles/doca/tasks/install-kernel-devel.yml new file mode 100644 index 000000000..6a1943a32 --- /dev/null +++ b/ansible/roles/doca/tasks/install-kernel-devel.yml @@ -0,0 +1,24 @@ +- name: Get installed kernels + command: dnf list --installed kernel + register: _ofed_dnf_kernels + changed_when: false + +- name: Determine running kernel + command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64 + register: _ofed_loaded_kernel + changed_when: false + +- name: Check current kernel is newest installed + assert: + that: _ofed_kernel_current == _ofed_dnf_kernels_newest + fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" + vars: + _ofed_kernel_current: >- + {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} + _ofed_dnf_kernels_newest: >- + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} + # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + +- name: Install matching kernel-devel package + dnf: + name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}" diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml new file mode 100644 index 000000000..d26fda79e --- /dev/null +++ b/ansible/roles/doca/tasks/install.yml @@ -0,0 +1,51 @@ +- import_tasks: install-kernel-devel.yml + +- name: Install DOCA repo + ansible.builtin.yum_repository: + name: doca + file: doca + description: DOCA Online Repo + baseurl: "{{ doca_repo_url }}" + enabled: true + gpgcheck: false + +- name: Install doca-extra package + ansible.builtin.dnf: + name: doca-extra + +- name: Build DOCA kernel modules + ansible.builtin.shell: + cmd: /opt/mellanox/doca/tools/doca-kernel-support + register: _doca_kernel_build + + +- name: Find generated doca-kernel-repo + ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*' + register: _doca_kernel_repo # e.g. 
/tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm + changed_when: false + +- name: Create dnf cache + ansible.builtin.command: dnf makecache + +- name: Install DOCA repository package + ansible.builtin.dnf: + name: "{{ _doca_kernel_repo.stdout }}" + disable_gpg_check: true + +- name: Install DOCA packages + ansible.builtin.dnf: + name: "{{ doca_profile }}" + +- name: Cleanup DOCA build directories + ansible.builtin.file: + state: absent + path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # leading / means 1st element of split list is '' + +- name: Update initramfs + ansible.builtin.command: + cmd: dracut -f + register: _doca_dracut + failed_when: _doca_dracut.stderr != '' # appears rc is always 0 + +- name: Load the new driver + ansible.builtin.command: /etc/init.d/openibd restart diff --git a/ansible/roles/doca/tasks/main.yml b/ansible/roles/doca/tasks/main.yml new file mode 100644 index 000000000..e7a272f38 --- /dev/null +++ b/ansible/roles/doca/tasks/main.yml @@ -0,0 +1 @@ +- include_tasks: install.yml diff --git a/ansible/roles/eessi/tasks/main.yaml b/ansible/roles/eessi/tasks/main.yaml index d121b6fdd..c61625b0e 100644 --- a/ansible/roles/eessi/tasks/main.yaml +++ b/ansible/roles/eessi/tasks/main.yaml @@ -10,6 +10,7 @@ - name: Add CVMFS repo dnf: name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm + disable_gpg_check: true - name: Install CVMFS dnf: diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 8ee0e6114..b9b82f1c4 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -3,7 +3,7 @@ vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" k3s_token: "{{ os_metadata.meta.k3s_token }}" - k3s_server_name: "{{ os_metadata.meta.k3s_server }}" + k3s_server_name: "{{ os_metadata.meta.control_address }}" service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" tasks: - name: Ensure password directory exists diff --git a/ansible/roles/lustre/README.md b/ansible/roles/lustre/README.md index c0a25e037..3ba0dad56 100644 --- a/ansible/roles/lustre/README.md +++ b/ansible/roles/lustre/README.md @@ -8,7 +8,7 @@ Install and configure a Lustre client. This builds RPM packages from source. ## Role Variables -- `lustre_version`: Optional str. Version of lustre to build, default `2.15.5` which is the first version with EL9 support +- `lustre_version`: Optional str. Version of lustre to build, default `2.15.6` which is the first version with EL9.5 support - `lustre_lnet_label`: Optional str. The "lnet label" part of the host's NID, e.g. `tcp0`. Only the `tcp` protocol type is currently supported. Default `tcp`. - `lustre_mgs_nid`: Required str. The NID(s) for the MGS, e.g. `192.168.227.11@tcp1` (separate mutiple MGS NIDs using `:`). - `lustre_mounts`: Required list. 
Define Lustre filesystems and mountpoints as a list of dicts with keys: diff --git a/ansible/roles/lustre/defaults/main.yml b/ansible/roles/lustre/defaults/main.yml index be008ad55..40389970c 100644 --- a/ansible/roles/lustre/defaults/main.yml +++ b/ansible/roles/lustre/defaults/main.yml @@ -1,4 +1,4 @@ -lustre_version: '2.15.5' # https://www.lustre.org/lustre-2-15-5-released/ +lustre_version: '2.15.6' # https://www.lustre.org/lustre-2-15-6-released/ lustre_lnet_label: tcp #lustre_mgs_nid: lustre_mounts: [] diff --git a/ansible/roles/lustre/tasks/install.yml b/ansible/roles/lustre/tasks/install.yml index e0af857cf..852b4652f 100644 --- a/ansible/roles/lustre/tasks/install.yml +++ b/ansible/roles/lustre/tasks/install.yml @@ -41,30 +41,9 @@ ansible.builtin.dnf: name: "{{ _lustre_find_rpms.files | map(attribute='path')}}" disable_gpg_check: yes - -- block: - - name: Remove lustre build prerequisites - # NB Only remove ones this role installed which weren't upgrades - ansible.builtin.dnf: - name: "{{ _new_pkgs }}" - state: absent - vars: - _installed_pkgs: | - {{ - _lustre_dnf_build_packages.results | - select('match', 'Installed:') | - map('regex_replace', '^Installed: (.+?)-[0-9].*$', '\1') - }} - _removed_pkgs: | - {{ - _lustre_dnf_build_packages.results | - select('match', 'Removed:') | - map('regex_replace', '^Removed: (.+?)-[0-9].*$', '\1') - }} - _new_pkgs: "{{ _installed_pkgs | difference(_removed_pkgs) }}" - - - name: Delete lustre build dir - file: - path: "{{ lustre_build_dir }}" - state: absent + +- name: Delete lustre build dir + file: + path: "{{ lustre_build_dir }}" + state: absent when: lustre_build_cleanup | bool diff --git a/ansible/roles/openondemand/README.md b/ansible/roles/openondemand/README.md index c6a4f3f9f..365265df0 100644 --- a/ansible/roles/openondemand/README.md +++ b/ansible/roles/openondemand/README.md @@ -17,7 +17,7 @@ This uses the [osc.ood](https://github.com/OSC/ood-ansible) Ansible role to prov ### General - `openondemand_clusters`: Required. Synonym for [osc.ood: clusters](https://github.com/OSC/ood-ansible#clusters) role variable. -- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. +- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. 
Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. Changing this value on an already deployed cluster requires a reboot of the login node for OOD app state to be correctly refreshed. ### Authentication See the Open Ondemand [Authentication docs](https://osc.github.io/ood-documentation/latest/authentication/overview.html) for an overview of the authentication process. @@ -77,7 +77,7 @@ The Open Ondemand portal can proxy other servers. Variables: to proxy: - All "compute" nodes, e.g. for Open Ondemand interactive apps such as remote desktop and Jupyter notebook server. - The Grafana server - note a link to Grafana is always added to the Open Ondemand dashboard. - + The exact pattern depends on inventory hostnames / partitions / addresses. - `openondemand_node_proxy_directives`: Optional, default ''. Multiline string to insert into Apache directives definition for `node_uri` ([docs](https://osc.github.io/ood-documentation/master/reference/files/ood-portal-yml.html#configure-reverse-proxy)). diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index 86184f13c..bd5706ecb 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -6,12 +6,19 @@ loop: "{{ openondemand_osc_ood_defaults | dict2items }}" when: (item.key in hostvars[inventory_hostname]) or (item.value) +# osc.ood variables are exposed to play here instead of setting 'public' in include role so that they will still be exposed during runtime +- ansible.builtin.include_vars: + dir: "{{ playbook_dir }}/roles/osc.ood/defaults/main" + +- ansible.builtin.include_vars: + file: "{{ playbook_dir }}/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml" + # if using PAM auth we need apache installed but NOT started so split the osc.ood role up: - include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" - public: yes # Expose the vars from this role to the rest of the play + when: appliances_mode != 'configure' # can't set vars: from a dict hence the workaround above - include_tasks: diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml b/ansible/roles/openondemand/tasks/pam_auth.yml index 0edce622f..6bc4bda36 100644 --- a/ansible/roles/openondemand/tasks/pam_auth.yml +++ b/ansible/roles/openondemand/tasks/pam_auth.yml @@ -1,6 +1,6 @@ # https://osc.github.io/ood-documentation/latest/authentication/pam.html --- -- name: Install Apache PAM module +- name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build yum: name: mod_authnz_pam diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index 388e3b3c5..6ec340249 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -48,6 +48,7 @@ tags: install yum: name: '@Xfce' + when: appliances_mode != 'configure' # dnf group/module installs aren't idempotent so only run during build # - name: Ensure python3.9 installed # dnf: diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index d9a339efd..929aac465 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -9,6 +9,8 @@ slurm_appliance_secrets: vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | 
default(lookup('password', '/dev/null')) }}" vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}" vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" + vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}" + vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}" secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" diff --git a/ansible/roles/passwords/tasks/validate.yml b/ansible/roles/passwords/tasks/validate.yml index 9279ffdbf..b30b0696e 100644 --- a/ansible/roles/passwords/tasks/validate.yml +++ b/ansible/roles/passwords/tasks/validate.yml @@ -1,4 +1,4 @@ - name: Assert secrets created assert: - that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_testuser_password defined in dev + that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_demo_user_password defined in dev fail_msg: "No inventory variables 'vault_*' found: Has ansible/adhoc/generate-passwords.yml been run?" diff --git a/ansible/roles/persist_hostkeys/README.md b/ansible/roles/persist_hostkeys/README.md index 2d823dc36..6201a104b 100644 --- a/ansible/roles/persist_hostkeys/README.md +++ b/ansible/roles/persist_hostkeys/README.md @@ -1,8 +1,5 @@ # persist_hostkeys -Save hostkeys to persistent storage and restore them after a rebuild/reimage. +Idempotently generates a persistent set of hostkeys and restores them after a rebuild/reimage. -Add hosts to the `persist_hostkeys` group to enable. - -This role has no variables but hosts in this group must have `appliances_state_dir` -defined as a directory they can write to on persistent storage. +Add hosts to the `persist_hostkeys` group to enable. All hosts in the group will share the same set of hostkeys.
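A minimal sketch of the resulting configuration (these values simply mirror the role defaults shown below and are only illustrative):

```yaml
# group_vars for hosts in the persist_hostkeys group
persist_hostkeys_state_server: "{{ groups['control'] | first }}"   # host providing persistent storage
persist_hostkeys_state_dir: "{{ hostvars[persist_hostkeys_state_server]['appliances_state_dir'] }}/hostkeys"
```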
diff --git a/ansible/roles/persist_hostkeys/defaults/main.yml b/ansible/roles/persist_hostkeys/defaults/main.yml new file mode 100644 index 000000000..3c0000466 --- /dev/null +++ b/ansible/roles/persist_hostkeys/defaults/main.yml @@ -0,0 +1,2 @@ +persist_hostkeys_state_server: "{{ groups['control'] | first }}" +persist_hostkeys_state_dir: "{{ hostvars[persist_hostkeys_state_server]['appliances_state_dir'] }}/hostkeys" diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index 47493220d..deff112f7 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -1,33 +1,47 @@ --- -- name: Ensure hostkeys directory exists on persistent storage - file: - path: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}" - state: directory - owner: root - group: root - mode: 0600 +- name: Generate persistent hostkeys in state directory + delegate_to: "{{ persist_hostkeys_state_server }}" + block: + - name: Ensure hostkeys directory exists on persistent storage + file: + path: "{{ persist_hostkeys_state_dir }}" + state: directory + owner: root + group: root + mode: 0600 -- name: Copy hostkeys from persistent storage - # won't fail if no keys are in persistent storage - copy: - src: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" - dest: /etc/ssh/ - remote_src: true + - name: Check for existing hostkeys + find: + paths: "{{ persist_hostkeys_state_dir }}/" + register: _files_found + + - name: Generate hostkeys + when: _files_found.matched == 0 + shell: + # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into + cmd: | + mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh + ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} + mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} + rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh + + - name: Get created key names + find: + path: "{{ persist_hostkeys_state_dir }}/" + register: _find_ssh_keys -- name: Find hostkeys - find: - path: /etc/ssh/ - patterns: ssh_host_*_key* - register: _find_ssh_keys + - name: Create in-memory copies of keys + ansible.builtin.slurp: + src: "{{ item.path }}" + loop: "{{ _find_ssh_keys.files }}" + register: _slurp_keys -- name: Persist hostkeys +- name: Copy keys to hosts + no_log: true copy: - dest: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" - src: "{{ item }}" - remote_src: true - mode: preserve - loop: "{{ _find_ssh_keys.files | map(attribute='path') }}" + content: "{{ item.content | b64decode }}" + dest: "/etc/ssh/{{ item.source | regex_search('[^/]+$') }}" + loop: "{{ _slurp_keys.results }}" - meta: reset_connection - diff --git a/ansible/roles/podman/tasks/config.yml b/ansible/roles/podman/tasks/config.yml index 5fea3c2e0..74cf1d576 100644 --- a/ansible/roles/podman/tasks/config.yml +++ b/ansible/roles/podman/tasks/config.yml @@ -55,6 +55,7 @@ # Type Path Mode User Group Age Argument R! /tmp/containers-user-* R! /tmp/podman-run-* + R! 
/tmp/storage-run-* dest: /etc/tmpfiles.d/podman-local.conf owner: root group: root diff --git a/ansible/roles/pulp_site/.gitignore b/ansible/roles/pulp_site/.gitignore new file mode 100644 index 000000000..6738e49c1 --- /dev/null +++ b/ansible/roles/pulp_site/.gitignore @@ -0,0 +1 @@ +filter_plugins/__pycache__ \ No newline at end of file diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml new file mode 100644 index 000000000..c549dac53 --- /dev/null +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -0,0 +1,42 @@ +pulp_site_url: "{{ appliances_pulp_url }}" +pulp_site_port: 8080 +pulp_site_username: admin # shouldn't be changed +pulp_site_password: "{{ vault_pulp_admin_password }}" +pulp_site_upstream_content_url: https://ark.stackhpc.com/pulp/content +pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" +pulp_site_validate_certs: false +pulp_site_install_dir: '/home/rocky/pulp' +pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" +pulp_site_target_facts: "{{ hostvars[groups['pulp'][0]]['ansible_facts'] }}" +pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" +pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" + +pulp_site_rpm_info: +- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ohpc-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ceph-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" + 
+pulp_site_rpm_repo_defaults: + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand + state: present + +_pulp_site_rpm_info_all: "{{ pulp_site_rpm_info | map('combine', pulp_site_rpm_repo_defaults) }}" + +pulp_site_rpm_repos: "{{ _pulp_site_rpm_info_all | to_rpm_repos(pulp_site_upstream_content_url) }}" +pulp_site_rpm_publications: "{{ _pulp_site_rpm_info_all | to_rpm_pubs }}" +pulp_site_rpm_distributions: "{{ _pulp_site_rpm_info_all | to_rpm_distros }}" diff --git a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py new file mode 100644 index 000000000..50e912685 --- /dev/null +++ b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py @@ -0,0 +1,31 @@ +class FilterModule(object): + def filters(self): + return { + 'to_rpm_repos': self.to_rpm_repos, + 'to_rpm_pubs': self.to_rpm_pubs, + 'to_rpm_distros': self.to_rpm_distros + } + + def to_rpm_repos(self, list, pulp_url): + repo_list = map(lambda x: { + 'name': x['name'], + 'url': pulp_url+'/'+x['subpath'], + 'remote_username': x['remote_username'], + 'remote_password': x['remote_password'], + 'policy': x['policy'], + 'state': x['state'] }, list) + return repo_list + + def to_rpm_pubs(self, list): + pub_list = map(lambda x: { + 'repository': x['name'], + 'state': x['state'] }, list) + return pub_list + + def to_rpm_distros(self, list): + distro_list = map(lambda x: { + 'name': x['name'], + 'repository': x['name'], + 'base_path': x['subpath'], + 'state': x['state'] }, list) + return distro_list \ No newline at end of file diff --git a/ansible/roles/pulp_site/tasks/install.yml b/ansible/roles/pulp_site/tasks/install.yml new file mode 100644 index 000000000..39b4fcd97 --- /dev/null +++ b/ansible/roles/pulp_site/tasks/install.yml @@ -0,0 +1,43 @@ +--- + +- name: Install packages + dnf: + name: + - podman + +- name: Create install directories + ansible.builtin.file: + state: directory + path: "{{ pulp_site_install_dir }}/{{ item }}" + loop: + - settings/certs + - pulp_storage + - pgsql + - containers + +- name: Template settings file + ansible.builtin.template: + src: settings.py.j2 + dest: "{{ pulp_site_install_dir }}/settings/settings.py" + +- name: Install pulp podman container + containers.podman.podman_container: + name: pulp + publish: + - "{{ pulp_site_port }}:80" + volume: + - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ pulp_site_selinux_suffix }}" + device: /dev/fuse + image: docker.io/pulp/pulp:3.68.1 + +- name: Reset admin password once container has initialised + no_log: true + ansible.builtin.shell: + cmd: "podman exec pulp bash -c 'pulpcore-manager reset-admin-password -p {{ pulp_site_password }}'" + register: _admin_reset_output + until: 0 == _admin_reset_output.rc + retries: 6 + delay: 30 diff --git a/ansible/roles/pulp_site/tasks/sync.yml b/ansible/roles/pulp_site/tasks/sync.yml new file mode 100644 index 000000000..5ef2bc5f1 --- /dev/null +++ b/ansible/roles/pulp_site/tasks/sync.yml @@ -0,0 +1,78 @@ +--- + +- ansible.builtin.assert: + that: pulp_site_upstream_password != '' + quiet: true + fail_msg: "Upstream password not set. Either set env var ARK_PASSWORD or override pulp_site_upstream_password." 
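# Illustrative sketch only (not a task): the upstream credentials checked above are normally
# supplied as vault-encrypted group_vars in the target environment, for example
# (the variable value and file location are assumptions):
#
#   pulp_site_upstream_username: <your Ark username>
#   pulp_site_upstream_password: "{{ vault_ark_password }}"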
+ +- name: Wait for Pulp server + pulp.squeezer.status: + pulp_url: "{{ pulp_site_url }}" + username: "{{ pulp_site_username }}" + password: "{{ pulp_site_password }}" + register: _pulp_status + until: _pulp_status.failed == false + retries: 30 + delay: 20 + +- name: Ensure Pulp CLI config directory exists + ansible.builtin.file: + path: ~/.config/pulp + state: directory + +- name: Create config file + no_log: true + ansible.builtin.template: + src: cli.toml.j2 + dest: ~/.config/pulp/cli.toml + mode: '0644' + +- block: + - name: Ensure squeezer cache exists + ansible.builtin.file: + path: "{{ _cache_dir }}" + state: directory + + - name: Check if squeezer cache is populated + ansible.builtin.stat: + path: "{{ _cache_dir }}/api.json" + register: _cache_stat + + - name: Prepopulate squeezer cache # workaround for race on the cache + ansible.builtin.get_url: + url: "{{ pulp_site_url }}/pulp/api/v3/docs/api.json" + dest: "{{ _cache_dir }}/api.json" + timeout: 40 + when: not _cache_stat.stat.exists + vars: + _cache_dir: "~/.cache/squeezer/{{ pulp_site_url | regex_replace( ':|/' , '_' ) }}" + +- name: Get Pulp repos from release train + ansible.builtin.include_role: + name: stackhpc.pulp.pulp_repository + tasks_from: rpm.yml + vars: + pulp_url: "{{ pulp_site_url }}" + pulp_username: "{{ pulp_site_username }}" + pulp_password: "{{ pulp_site_password }}" + pulp_repository_rpm_repos: "{{ pulp_site_rpm_repos }}" + +- name: Create Pulp publications + ansible.builtin.include_role: + name: stackhpc.pulp.pulp_publication + tasks_from: rpm.yml + vars: + pulp_url: "{{ pulp_site_url }}" + pulp_username: "{{ pulp_site_username }}" + pulp_password: "{{ pulp_site_password }}" + pulp_publication_rpm: "{{ pulp_site_rpm_publications }}" + +- name: Create Pulp distributions + ansible.builtin.include_role: + name: stackhpc.pulp.pulp_distribution + tasks_from: rpm.yml + vars: + pulp_url: "{{ pulp_site_url }}" + pulp_username: "{{ pulp_site_username }}" + pulp_password: "{{ pulp_site_password }}" + pulp_distribution_rpm: "{{ pulp_site_rpm_distributions }}" diff --git a/ansible/roles/pulp_site/templates/cli.toml.j2 b/ansible/roles/pulp_site/templates/cli.toml.j2 new file mode 100644 index 000000000..06867902f --- /dev/null +++ b/ansible/roles/pulp_site/templates/cli.toml.j2 @@ -0,0 +1,14 @@ +[cli] +base_url = "{{ pulp_site_url }}" +username = "{{ pulp_site_username }}" +password = "{{ pulp_site_password }}" +api_root = "/pulp/" +domain = "default" +headers = [] +cert = "" +key = "" +verify_ssl = true +format = "json" +dry_run = false +timeout = 0 +verbose = 0 diff --git a/ansible/roles/pulp_site/templates/settings.py.j2 b/ansible/roles/pulp_site/templates/settings.py.j2 new file mode 100644 index 000000000..200212e2c --- /dev/null +++ b/ansible/roles/pulp_site/templates/settings.py.j2 @@ -0,0 +1,2 @@ +CONTENT_ORIGIN='http://{{ ansible_fqdn }}:{{ pulp_site_port }}' +TOKEN_AUTH_DISABLED=True diff --git a/ansible/roles/sshd/README.md b/ansible/roles/sshd/README.md new file mode 100644 index 000000000..0fac1d189 --- /dev/null +++ b/ansible/roles/sshd/README.md @@ -0,0 +1,9 @@ +# sshd + +Configure sshd. + +## Role variables + +- `sshd_password_authentication`: Optional bool. Whether to enable password login. Default `false`. +- `sshd_conf_src`: Optional string. Path to sshd configuration template. Default is in-role template. +- `sshd_conf_dest`: Optional string. Path to destination for sshd configuration file. Default is `/etc/ssh/sshd_config.d/10-ansible.conf` which overides `50-{cloud-init,redhat}` files, if present. 
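As a usage sketch (the group_vars location is illustrative), password logins could be enabled by overriding the default in e.g. `environments/<site>/inventory/group_vars/all/sshd.yml`:

```yaml
sshd_password_authentication: true   # renders "PasswordAuthentication yes" into 10-ansible.conf
# sshd_conf_src: my-sshd.conf.j2     # optionally use a site-specific template instead of the in-role one
```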
diff --git a/ansible/roles/sshd/defaults/main.yml b/ansible/roles/sshd/defaults/main.yml new file mode 100644 index 000000000..672305799 --- /dev/null +++ b/ansible/roles/sshd/defaults/main.yml @@ -0,0 +1,3 @@ +sshd_password_authentication: false +sshd_conf_src: sshd.conf.j2 +sshd_conf_dest: /etc/ssh/sshd_config.d/10-ansible.conf diff --git a/ansible/roles/sshd/handlers/main.yml b/ansible/roles/sshd/handlers/main.yml new file mode 100644 index 000000000..e11aa7801 --- /dev/null +++ b/ansible/roles/sshd/handlers/main.yml @@ -0,0 +1,4 @@ +- name: Restart sshd + systemd: + name: sshd + state: restarted diff --git a/ansible/roles/sshd/tasks/configure.yml b/ansible/roles/sshd/tasks/configure.yml new file mode 100644 index 000000000..8aafb5c19 --- /dev/null +++ b/ansible/roles/sshd/tasks/configure.yml @@ -0,0 +1,15 @@ +- name: Template sshd configuration + # NB: If parameters are defined multiple times the first value wins; + # The default /etc/ssh/sshd_config has + # Include /etc/ssh/sshd_config.d/*.conf + # early on, which is generally held to be the correct approach, so adding + # values to the end of that file won't work + template: + src: "{{ sshd_conf_src }}" + dest: "{{ sshd_conf_dest }}" + owner: root + group: root + mode: u=rw,go= + validate: sshd -t -f %s + notify: + - Restart sshd diff --git a/ansible/roles/sshd/tasks/main.yml b/ansible/roles/sshd/tasks/main.yml new file mode 100644 index 000000000..84f493457 --- /dev/null +++ b/ansible/roles/sshd/tasks/main.yml @@ -0,0 +1 @@ +- import_tasks: configure.yml diff --git a/ansible/roles/sshd/templates/sshd.conf.j2 b/ansible/roles/sshd/templates/sshd.conf.j2 new file mode 100644 index 000000000..2746f0642 --- /dev/null +++ b/ansible/roles/sshd/templates/sshd.conf.j2 @@ -0,0 +1,2 @@ +# {{ ansible_managed }} +PasswordAuthentication {{ 'yes' if sshd_password_authentication | bool else 'no' }} diff --git a/ansible/roles/sssd/README.md b/ansible/roles/sssd/README.md new file mode 100644 index 000000000..da4e63f31 --- /dev/null +++ b/ansible/roles/sssd/README.md @@ -0,0 +1,18 @@ +# sssd + +Install and configure [sssd](https://sssd.io/docs/introduction.html). + + +## Role variables + +The only required configuration is to create a [sssd.conf](https://www.mankier.com/5/sssd.conf) template at the location specified by `sssd_conf_src`. + +- `sssd_packages`: Optional list. Packages to install. +- `sssd_install_ldap`: Optional bool. Whether to install packages enabling SSSD to authenticate against LDAP. Default `false`. +- `sssd_ldap_packages`: Optional list. Packages to install when using `sssd_install_ldap`. +- `sssd_enable_mkhomedir`: Optional bool. Whether to enable creation of home directories on login. Default `false`. +- `sssd_mkhomedir_packages`: Optional list. Packages to install when using `sssd_enable_mkhomedir`. +- `sssd_conf_src`: Optional string. Path to `sssd.conf` template. Default is `{{ appliances_environment_root }}/files/sssd.conf.j2`, which must be created. +- `sssd_conf_dest`: Optional string. Path to destination for `sssd.conf`. Default `/etc/sssd/sssd.conf`. +- `sssd_started`: Optional bool. Whether `sssd` service should be started. +- `sssd_enabled`: Optional bool. Whether `sssd` service should be enabled.
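For example, a minimal sketch enabling LDAP authentication and home directory creation (the group_vars path is illustrative; the `sssd.conf` template itself must still be provided as described above):

```yaml
# environments/<site>/inventory/group_vars/all/sssd.yml (illustrative path)
sssd_install_ldap: true       # also installs sssd_ldap_packages (sssd-ldap)
sssd_enable_mkhomedir: true   # also installs sssd_mkhomedir_packages (oddjob-mkhomedir)
```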
diff --git a/ansible/roles/sssd/defaults/main.yml b/ansible/roles/sssd/defaults/main.yml new file mode 100644 index 000000000..5bc58c990 --- /dev/null +++ b/ansible/roles/sssd/defaults/main.yml @@ -0,0 +1,12 @@ +sssd_packages: + - sssd-common +sssd_install_ldap: false +sssd_ldap_packages: + - sssd-ldap +sssd_enable_mkhomedir: false +sssd_mkhomedir_packages: + - oddjob-mkhomedir +sssd_conf_src: "{{ appliances_environment_root }}/files/sssd.conf.j2" +sssd_conf_dest: /etc/sssd/sssd.conf +sssd_started: true +sssd_enabled: true diff --git a/ansible/roles/sssd/handlers/main.yml b/ansible/roles/sssd/handlers/main.yml new file mode 100644 index 000000000..72c36e736 --- /dev/null +++ b/ansible/roles/sssd/handlers/main.yml @@ -0,0 +1,5 @@ +- name: Restart sssd + systemd: + name: sssd + state: restarted + when: sssd_started | bool diff --git a/ansible/roles/sssd/tasks/configure.yml b/ansible/roles/sssd/tasks/configure.yml new file mode 100644 index 000000000..ae636e9dd --- /dev/null +++ b/ansible/roles/sssd/tasks/configure.yml @@ -0,0 +1,28 @@ +- name: Manage sssd.conf configuration + template: + src: "{{ sssd_conf_src }}" + dest: "{{ sssd_conf_dest }}" + owner: root + group: root + mode: u=rw,go= + notify: "Restart sssd" + +- meta: flush_handlers + +- name: Ensure sssd service state + systemd: + name: sssd + state: "{{ 'started' if sssd_started | bool else 'stopped' }}" + enabled: "{{ sssd_enabled | bool }}" + +- name: Get current authselect configuration + command: authselect current --raw + changed_when: false + failed_when: + - _authselect_current.rc != 0 + - "'No existing configuration detected' not in _authselect_current.stdout" + register: _authselect_current # stdout: sssd with-mkhomedir + +- name: Configure nsswitch and PAM for SSSD + command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}" + when: "'sssd' not in _authselect_current.stdout" diff --git a/ansible/roles/sssd/tasks/install.yml b/ansible/roles/sssd/tasks/install.yml new file mode 100644 index 000000000..97aa82a2f --- /dev/null +++ b/ansible/roles/sssd/tasks/install.yml @@ -0,0 +1,13 @@ +- name: Ensure sssd packages are installed + dnf: + name: "{{ sssd_packages + sssd_ldap_packages if (sssd_install_ldap | bool) else [] }}" + +- name: Control if sssd should start on boot + # Needs to be done here to prevent starting after image build, is enabled by default + systemd: + name: sssd + enabled: "{{ sssd_enabled | bool }}" + +- name: Ensure mkhomedir packages are installed if required + dnf: + name: "{{ sssd_mkhomedir_packages }}" diff --git a/ansible/roles/sssd/tasks/main.yml b/ansible/roles/sssd/tasks/main.yml new file mode 100644 index 000000000..2b65e84b4 --- /dev/null +++ b/ansible/roles/sssd/tasks/main.yml @@ -0,0 +1,2 @@ +- import_tasks: install.yml +- import_tasks: configure.yml diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml index dbb920c58..6b1a43aaa 100644 --- a/ansible/roles/zenith_proxy/defaults/main.yml +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -15,12 +15,12 @@ zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" -zenith_proxy_image_tag: '0.1.0' +zenith_proxy_image_tag: '0.12.0' -zenith_proxy_client_image_repository: ghcr.io/stackhpc/zenith-client +zenith_proxy_client_image_repository: ghcr.io/azimuth-cloud/zenith-client zenith_proxy_client_image: "{{ 
zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" -zenith_proxy_mitm_image_repository: ghcr.io/stackhpc/zenith-proxy +zenith_proxy_mitm_image_repository: ghcr.io/azimuth-cloud/zenith-proxy zenith_proxy_mitm_image: "{{ zenith_proxy_mitm_image_repository }}:{{ zenith_proxy_image_tag }}" zenith_proxy_upstream_scheme: http diff --git a/ansible/site.yml b/ansible/site.yml index bb379399d..d973d9cb3 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,6 +27,7 @@ - import_playbook: slurm.yml - import_playbook: portal.yml - import_playbook: monitoring.yml +- import_playbook: disable-repos.yml - name: Run post.yml hook vars: diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 0b7397242..cf282f786 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -25,8 +25,9 @@ tags: - openhpc tasks: - - import_role: + - include_role: name: stackhpc.openhpc + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Set locked memory limits on user-facing nodes hosts: diff --git a/dev/setup-env.sh b/dev/setup-env.sh index bfa0758e6..6d701f2b7 100755 --- a/dev/setup-env.sh +++ b/dev/setup-env.sh @@ -17,6 +17,7 @@ PYTHON_VERSION="" if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then PYTHON_VERSION="/usr/bin/python3.10" elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then + # python3.9+ doesn't have selinux bindings PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then PYTHON_VERSION="/usr/bin/python3.9" diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md new file mode 100644 index 000000000..c7c1d4d8c --- /dev/null +++ b/docs/experimental/compute-init.md @@ -0,0 +1,18 @@ +# compute-init + +See the role README.md + +# CI workflow + +The compute node rebuild is tested in CI after the tests for rebuilding the +login and control nodes. The process follows + +1. Compute nodes are reimaged: + + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml + +2. Ansible-init runs against newly reimaged compute nodes + +3. Run sinfo and check nodes have expected slurm state + + ansible-playbook -v ansible/ci/check_slurm.yml \ No newline at end of file diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md new file mode 100644 index 000000000..fb2cda023 --- /dev/null +++ b/docs/experimental/pulp.md @@ -0,0 +1,17 @@ +# Pulp Server + +In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. + +## Deploying/configuring Pulp Server + +### Deploying a Pulp server +A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. This can be run with +`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` +where `target_host` is any resolvable host. This will print a Pulp URL which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. 
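As a sketch of the final step (the hostname here is illustrative), the printed URL would then be recorded in the target environment's group_vars:

```yaml
# e.g. environments/<site>/inventory/group_vars/all/pulp.yml
appliances_pulp_url: "http://pulp-server.internal:8080"   # port matches pulp_site_port
```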
+ +### Using an existing Pulp server +An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. Note that this assumes the same configuration as the appliance deployed pulp i.e no content authentication. + +## Syncing Pulp content with Ark + +If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.5 with x86_64 architecture, but can be overriden by setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`. diff --git a/docs/image-build.md b/docs/image-build.md index 4896bde57..dc968ebfd 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -2,87 +2,58 @@ The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. -The Packer configuration defined here builds "fat images" which contain binaries for all nodes, but no cluster-specific configuration. Using these: +The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these: - Enables the image to be tested in CI before production use. - Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). - Improves deployment speed by reducing the number of package downloads to improve deployment speed. -By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image. - -The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: +The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: 1. Build site-specific fat images from scratch. -2. Extend an existing fat image with additional software. +2. Extend an existing fat image with additional functionality. # Usage -The steps for building site-specific fat images or extending an existing fat image are the same: +To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image: 1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). -2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum e.g.: +2. 
The provided dev credentials for StackHPC's "Ark" Pulp server must be added to the target environments. This is done by overriding `dnf_repos_username` and `dnf_repos_password` with your vault encrypted credentials in `environments//inventory/group_vars/all/pulp.yml`. See the [experimental docs](experimental/pulp.md) if you instead wish to use a local Pulp server. +3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum: ```hcl flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to + source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image + inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to + ``` + Note that: - - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - - For an example of configuration for extending an existing fat image see below. + - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). + - The flavor used must have sufficient memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficient. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. + - The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. + - The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image. All possible groups are listed in `environments/common/groups` but common options for this variable will be: + - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. + - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. -3. Activate the venv and the relevant environment. +4. Activate the venv and the relevant environment. -4. Build images using the relevant variable definition file, e.g.: +5.
Build images using the relevant variable definition file, e.g.: cd packer/ - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - - Note that the `-only` flag here restricts Packer to a single specific "build" definition (in Packer terminology). Options here are: - - `-only=openstack.openhpc`: Build a fat image including Mellanox OFED - - `-only=openstack.openhpc-cuda`: Build a fat image including Mellanox OFED, Nvidia drivers and CUDA - - `-only=openstack.openhpc-extra`: Build an image which *extends* an existing fat image - -5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. - -# Defining an "extra" image build - -An "extra" image build starts with an existing fat image (e.g. one provided by StackHPC) rather than a RockyLinux GenericCloud image, and only runs a specific subset of the -Ansible in the appliance. This allows adding additional functionality into site-specific images, without modifying the existing functionality in the base fat image. This is the recommended way to build site-specific images. - -To configure an "extra" image build, prepare a Packer variable definition file as described above but also including: - -- `extra_build_image_name`: A string to add into the final image name. -- `source_image` or `source_image_name`: The UUID or name of the fat image to start from (which must already be present in OpenStack). -- `extra_build_groups`: A list of Ansible inventory groups to put the build VM into, in addition to the `builder` group. This defines the roles/functionality - which are added to the image. -- `extra_build_volume_size`: A number giving the size in GB of the volume for the build VM's root disk and therefore the resulting image size. - Note this assumes the default of `use_blockstorage_volume = true`. - -E.g. to add the lustre client to an RockyLinux 9 image: - - # environments/site/lustre.pkvars.hcl - - extra_build_image_name = "lustre" # output image name will be like "openhpc-lustre-RL9-$timestamp-$commit" - source_image_name = "openhpc-ofed-RL9-240906-1041-32568dbb" # e.g. current StackHPC RL9 image - extra_build_groups = ["lustre"] # only run lustre role during this extra build - extra_build_volume_size = 15 # default non-CUDA build image size has enough free space - - # ... define flavor, network, etc as normal - - -Then, reference this build and variables file in the Packer build command: + PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc-extra --on-error=ask -var-file=environments/site/lustre.pkvars.hcl openstack.pkr.hcl + **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: -**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: + openstack image show $SOURCE_IMAGE - openstack image show $SOURCE_IMAGE + If it does, remove this property: -If it does, remove this property: + openstack image unset --property signature_verified $SOURCE_IMAGE - openstack image unset --property signature_verified $SOURCE_IMAGE + then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). 
-then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). +6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. # Build Process diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index 3e3de38c0..db228d410 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -96,7 +96,7 @@ The `grafana` group controls the placement of the grafana service. Load balancin ### Access -If Open Ondemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `3000`. +If Open OnDemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `3000`. The default credentials for the admin user are: diff --git a/docs/openondemand.README.md b/docs/openondemand.md similarity index 61% rename from docs/openondemand.README.md rename to docs/openondemand.md index 5daba3408..6b501d20b 100644 --- a/docs/openondemand.README.md +++ b/docs/openondemand.md @@ -1,28 +1,28 @@ # Overview -The appliance can deploy the Open Ondemand portal. This page describes how to enable this and the default appliance configuration/behaviour. Note that detailed configuration documentation is provided by: +The appliance can deploy the Open OnDemand portal. This page describes how to enable this and the default appliance configuration/behaviour. Note that detailed configuration documentation is provided by: - The README for the included `openondemand` role in this repo - [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md). - The README and default variables for the underlying "official" role which the above wraps - [Open OnDemand Ansible Role](https://github.com/OSC/ood-ansible) -- The documentation for Open Ondemand [itself](https://osc.github.io/ood-documentation/latest/index.html) +- The documentation for Open OnDemand [itself](https://osc.github.io/ood-documentation/latest/index.html) This appliance can deploy and configure: -- The Open Ondemand server itself (usually on a single login node). +- The Open OnDemand server itself (usually on a single login node). - User authentication using one of: - An external OIDC provider. - HTTP basic authenication and PAM. - Virtual desktops on compute nodes. - Jupyter nodebook servers on compute nodes. -- Proxying of Grafana (usually deployed on the control node) via the Open Ondemand portal. -- Links to additional filesystems and pages from the Open Ondemand Dashboard. -- A Prometheus exporter for the Open Ondemand server and related Grafana dashboard +- Proxying of Grafana (usually deployed on the control node) via the Open OnDemand portal. +- Links to additional filesystems and pages from the Open OnDemand Dashboard. +- A Prometheus exporter for the Open OnDemand server and related Grafana dashboard For examples of all of the above see the `smslabs-example` environment in this repo. 
-# Enabling Open Ondemand -To enable the Open Ondemand server, add single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open Ondemand must be able to access Slurm commands. +# Enabling Open OnDemand +To enable the Open OnDemand server, add a single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open OnDemand must be able to access Slurm commands. -To enable compute nodes for virtual desktops or Jupyter notebook servers (accessed through the Open Ondemand portal), add nodes/groups to the `openondemand_desktop` and `openondemand_jupyter` inventory groups respectively. These may be all or a subset of the `compute` group. +To enable compute nodes for virtual desktops or Jupyter notebook servers (accessed through the Open OnDemand portal), add nodes/groups to the `openondemand_desktop` and `openondemand_jupyter` inventory groups respectively. These may be all or a subset of the `compute` group. The above functionality is configured by running the `ansible/portal.yml` playbook. This is automatically run as part of `ansible/site.yml`. @@ -30,11 +30,10 @@ The above functionality is configured by running the `ansible/portal.yml` playbo See the [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md) for more details on the variables described below. -At minimum the following must be defined: -- `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). It is suggested to place it groupvars for `all`. -- `openondemand_auth` and any corresponding options. -- `openondemand_desktop_partition` and `openondemand_jupyter_partition` if the corresponding inventory groups are defined. -- `openondemand_host_regex` if `openondemand_desktop` or `openondemand_jupyter` inventory groups are defined and/or proxying Grafana via Open Ondemand is required. +The following variables have been given default values to allow Open OnDemand to work in a newly created environment without additional configuration, but generally should be overridden in `environments/site/inventory/group_vars/all/` with site-specific values: +- `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). Default is `ansible_host` (i.e. the IP address) of the first host in the `openondemand` group. +- `openondemand_auth` and any corresponding options. Defaults to `basic_pam`. +- `openondemand_desktop_partition` and `openondemand_jupyter_partition` if the corresponding inventory groups are defined. Defaults to the first compute group defined in the `compute` Terraform variable in `environments/$ENV/terraform`. It is also recommended to set: - `openondemand_dashboard_support_url` @@ -42,6 +41,9 @@ It is also recommended to set: If shared filesystems other than `$HOME` are available, add paths to `openondemand_filesapp_paths`.
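For illustration, a minimal set of site-level overrides for the variables above might look like the following sketch; the server name, partition name and support URL are placeholder values, not appliance defaults:

```yaml
# environments/site/inventory/group_vars/all/openondemand.yml (illustrative sketch):
openondemand_servername: ood.example.org   # DNS name users browse to; also needed by the grafana group
openondemand_auth: basic_pam               # or an OIDC configuration plus its corresponding options
openondemand_desktop_partition: standard   # partition used for virtual desktop jobs
openondemand_jupyter_partition: standard   # partition used for Jupyter notebook jobs
openondemand_dashboard_support_url: https://support.example.org
```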
+The appliance automatically configures Open OnDemand to proxy Grafana and adds a link to it on the Open OnDemand dashboard. This means no external IP (or SSH proxying etc) is required to access Grafana (which by default is deployed on the control node). To allow users to authenticate to Grafana, the simplest option is to enable anonymous (View-only) login by setting `grafana_auth_anonymous` (see [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml)[^1]). -[^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open Ondemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`). +[^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open OnDemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`). + +# Access +By default the appliance authenticates against OOD with basic auth through PAM. When creating a new environment, a new user with username `demo_user` will be created. Its password is found under `vault_openondemand_default_user` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`. Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default). diff --git a/docs/operations.md b/docs/operations.md index a20d7f10c..7a0a5b919 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -63,17 +63,46 @@ This is a usually a two-step process: Deploying the additional nodes and applying these changes requires rerunning both Terraform and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster). # Adding Additional Packages -Packages from any enabled DNF repositories (which always includes EPEL, PowerTools and OpenHPC) can be added to all nodes by defining a list `openhpc_packages_extra` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`. 
For example: - - # environments/foo-base/inventory/group_vars/all/openhpc.yml: - openhpc_packages_extra: - - somepackage - - anotherpackage - -The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules. - -To add these packages to the current cluster, run the same command as for [Reconfiguring Slurm](#Reconfiguring-Slurm). TODO: describe what's required to add these to site-specific images. +By default, the following utility packages are installed during the StackHPC image build: +- htop +- nano +- screen +- tmux +- wget +- bind-utils +- net-tools +- postfix +- git +- latest python version for the system (3.6 for Rocky 8.9 and 3.12 for Rocky 9.4) + +Additional packages can be added during image builds by: +- adding the `extra_packages` group to the build `inventory_groups` (see +[docs/image-build.md](./image-build.md)) +- defining a list of packages in `appliances_extra_packages_other` in e.g. +`environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example: + + ```yaml + # environments/foo-base/inventory/group_vars/all/defaults.yml: + appliances_extra_packages_other: + - somepackage + - anotherpackage + ``` + +For packages which come from repositories mirrored by StackHPC's "Ark" Pulp server +(including rocky, EPEL and OpenHPC repositories), this will require either [Ark +credentials](./image-build.md) or a [local Pulp mirror](./experimental/pulp.md) +to be configured. + +The packages available from the OpenHPC repos are described in Appendix E of +the OpenHPC installation guide (linked from the +[OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note +"user-facing" OpenHPC packages such as compilers, mpi libraries etc. include +corresponding `lmod` modules. + +Packages *may* also be installed during the `site.yml` playbook, by adding the `cluster` +group into the `extra_packages` group. An error will occur if Ark credentials +are defined in this case, as they are readable by unprivileged users in the +`.repo` files and a local Pulp mirror must be used instead. If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this: - `ansible.builtin.yum_repository`: Add a repo from a URL providing a 'repodata' directory. diff --git a/docs/production.md b/docs/production.md index 7219ee7fc..59b9f3775 100644 --- a/docs/production.md +++ b/docs/production.md @@ -1,9 +1,129 @@ # Production Deployments -This page contains some brief notes about differences between the default/demo configuration, as described in the main [README.md](../README.md) and production-ready deployments. +This page contains some brief notes about differences between the default/demo +configuration (as described in the main [README.md](../README.md)) and +production-ready deployments. -- Create a site environment. Usually at least production, staging and possibly development environments are required. To avoid divergence of configuration these should all have an `inventory` path referencing a shared, site-specific base environment.
Where possible hooks should also be placed in this site-specific environment. -- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. To ensure staging environments are a good model for production this should generally be moved into the site-specific environment. It can be be encrypted using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) and then committed to the repository. -- Ensure created instances have accurate/synchronised time. For VM instances this is usually provided by the hypervisor, but if not (or for bare metal instances) it may be necessary to configure or proxy `chronyd` via an environment hook. -- Remove production volumes from OpenTofu control. In the default OpenTofu configuration, deleting the resources also deletes the volumes used for persistent state and home directories. This is usually undesirable for production, so these resources should be removed from the OpenTofu configurations and manually deployed once. However note that for development environments leaving them under OpenTofu control is usually best. -- Configure Open OpenOndemand - see [specific documentation](openondemand.README.md). +- Get the cluster names agreed up front. Changing them later + requires instance deletion/recreation. + +- At least three environments should be created: + - `site`: site-specific base environment + - `production`: production environment + - `staging`: staging environment + + A `dev` environment should also be created if required, or this + can be left until later. + + These can all be produced using the cookiecutter instructions, but the + `production` and `staging` environments will need their + `environments/$ENV/ansible.cfg` file modified so that they point to the + `site` environment: + + ```ini + inventory = ../common/inventory,../site/inventory,inventory + ``` + +- To avoid divergence of configuration all possible overrides for group/role +vars should be placed in `environments/site/inventory/group_vars/all/*.yml` +unless the value really is environment-specific (e.g. DNS names for +`openondemand_servername`). + +- Where possible hooks should also be placed in `environments/site/hooks/` +and referenced from the `staging` and `production` environments, e.g.: + + ```yaml + # environments/production/hooks/pre.yml: + - name: Import parent hook + import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" + ``` + +- OpenTofu configurations should be defined in the `site` environment and used + as a module from the other environments. This can be done with the + cookiecutter-generated configurations: + - Delete the *contents* of the cookiecutter-generated `terraform/` directories + from the `production` and `staging` environments. + - Create a `main.tf` in those directories which uses `site/terraform/` as a + [module](https://opentofu.org/docs/language/modules/), e.g.: + + ``` + ... + module "cluster" { + source = "../../site/terraform/" + + cluster_name = "foo" + ... + } + ``` + + Note that: + - Environment-specific variables (`cluster_name`) should be hardcoded + into the module block. + - Environment-independent variables (e.g. maybe `cluster_net` if the + same is used for staging and production) should be set as *defaults* + in `environments/site/terraform/variables.tf`, and then don't need to + be passed in to the module. + +- Vault-encrypt secrets.
Running the `generate-passwords.yml` playbook creates + a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. + To ensure staging environments are a good model for production this should + generally be moved into the `site` environment. It should be encrypted + using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) + and then committed to the repository. + +- Ensure created instances have accurate/synchronised time. For VM instances + this is usually provided by the hypervisor, but if not (or for bare metal + instances) it may be necessary to configure or proxy `chronyd` via an + environment hook. + +- The cookiecutter-provided OpenTofu configurations define resources for home and + state volumes. The former may not be required if the cluster's `/home` is + provided from an external filesystem (or Manila). In any case, in at least + the production environment, and probably also in the staging environment, + the volumes should be manually created and the resources changed to [data + resources](https://opentofu.org/docs/language/data-sources/). This ensures that even if the cluster is deleted via tofu, the + volumes will persist. + + For a development environment, having volumes under tofu control via volume + resources is usually appropriate as there may be many instantiations + of this environment. + +- Enable `etc_hosts` templating: + + ```ini + # environments/site/inventory/groups: + [etc_hosts:children] + cluster + ``` + +- Configure Open OnDemand - see [specific documentation](openondemand.md). + +- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml` + +- Consider whether having (read-only) access to Grafana without login is OK. If not, remove `grafana_auth_anonymous` in `environments/$ENV/inventory/group_vars/all/grafana.yml` + +- Modify `environments/site/terraform/nodes.tf` to provide fixed IPs for at least + the control node, and (if not using FIPs) the login node(s): + + ``` + resource "openstack_networking_port_v2" "control" { + ... + fixed_ip { + subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id + ip_address = var.control_ip_address + } + } + ``` + + Note the variable `control_ip_address` is new. + + Using fixed IPs will require either using admin credentials or policy changes. + +- If floating IPs are required for login nodes, modify the OpenTofu configurations + appropriately. + +- Consider whether mapping of baremetal nodes to ironic nodes is required. See + [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). + +- Note [PR 473](https://github.com/stackhpc/ansible-slurm-appliance/pull/473) + may help identify any site-specific configuration.
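As a sketch of the `demo_user` removal noted above, a site might override `basic_users_users` with real users instead; the user name `alice`, the `uid` and the `vault_alice_password` secret below are illustrative placeholders, with the `password_hash` expression following the pattern used by the generated file:

```yaml
# environments/site/inventory/group_vars/all/basic_users.yml (illustrative sketch):
basic_users_users:
  - name: alice      # replaces the generated demo_user entry
    password: "{{ vault_alice_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent across hosts
    uid: 1010
```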
diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups index d60ae7839..45a1dc7aa 100644 --- a/environments/.caas/inventory/extra_groups +++ b/environments/.caas/inventory/extra_groups @@ -14,6 +14,3 @@ compute [podman:children] zenith - -[persist_hostkeys:children] -openondemand diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index 14fff6295..f42422601 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -1,14 +1,5 @@ nfs_server: "{{ nfs_server_default }}" -caas_nfs_ood_state: - - comment: Export /var/lib/state from Slurm control node to OOD - nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - clients: "{{ inventory_hostname in groups['openondemand'] }}" - nfs_export: "{{ appliances_state_dir }}" - nfs_client_mnt_point: "{{ appliances_state_dir }}" - nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service" - caas_nfs_home: - comment: Export /exports/home from Slurm control node as /home nfs_enable: @@ -17,4 +8,4 @@ caas_nfs_home: nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" -nfs_configurations: "{{ caas_nfs_ood_state + (caas_nfs_home if not cluster_home_manila_share | bool else []) }}" +nfs_configurations: "{{ caas_nfs_home if not cluster_home_manila_share | bool else [] }}" diff --git a/environments/.caas/inventory/group_vars/all/selinux.yml b/environments/.caas/inventory/group_vars/all/selinux.yml deleted file mode 100644 index 1f1098126..000000000 --- a/environments/.caas/inventory/group_vars/all/selinux.yml +++ /dev/null @@ -1 +0,0 @@ -selinux_state: disabled \ No newline at end of file diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index 5adf4199c..db0b28b49 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -4,4 +4,7 @@ networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] -floating_ip_network = "external" +# see environments/.stackhpc/inventory/group_vars/all/bastion.yml: +ssh_bastion_username = "slurm-app-ci" +ssh_bastion_host = "195.114.30.222" +ssh_bastion_private_key_file = "~/.ssh/id_rsa" diff --git a/environments/.stackhpc/SMS.pkrvars.hcl b/environments/.stackhpc/SMS.pkrvars.hcl index b88106fe8..3ebe734eb 100644 --- a/environments/.stackhpc/SMS.pkrvars.hcl +++ b/environments/.stackhpc/SMS.pkrvars.hcl @@ -2,6 +2,7 @@ flavor = "general.v1.small" networks = ["e2b9e59f-43da-4e1c-b558-dc9da4c0d738"] # stackhpc-ipv4-geneve ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" +# see environments/.stackhpc/inventory/group_vars/all/bastion.yml: ssh_bastion_username = "slurm-app-ci" ssh_bastion_host = "185.45.78.150" -ssh_bastion_private_key_file = "~/.ssh/id_rsa" \ No newline at end of file +ssh_bastion_private_key_file = "~/.ssh/id_rsa" diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml index bd60015d9..9d506d725 100644 --- a/environments/.stackhpc/hooks/post.yml +++ b/environments/.stackhpc/hooks/post.yml @@ -11,4 +11,4 @@ with_items: - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" - 
"/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" - - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock \ No newline at end of file + - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 7c9a7c774..2531b803e 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -31,3 +31,7 @@ compute [squid:children] # Install squid into fat image builder + +[sssd:children] +# Install sssd into fat image +builder diff --git a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml index ae416cf72..e2088ffd9 100644 --- a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml +++ b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml @@ -1,6 +1,6 @@ -test_user_password: "{{ lookup('env', 'TESTUSER_PASSWORD') | default(vault_testuser_password, true) }}" # CI uses env, debug can set vault_testuser_password +test_demo_user_password: "{{ lookup('env', 'DEMO_USER_PASSWORD') | default(vault_demo_user_password, true) }}" # CI uses env, debug can set vault_demo_user_password basic_users_users: - - name: testuser # can't use rocky as $HOME isn't shared! - password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent + - name: demo_user # can't use rocky as $HOME isn't shared! + password: "{{ test_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent uid: 1005 diff --git a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml index 4b3750650..9a979ab16 100644 --- a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml +++ b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml @@ -2,8 +2,8 @@ # NB: Users defined this way have expired passwords freeipa_users: - - name: testuser # can't use rocky as $HOME isn't shared! - password: "{{ test_user_password }}" + - name: demo_user # can't use rocky as $HOME isn't shared! 
+ password: "{{ test_demo_user_password }}" givenname: test sn: test diff --git a/environments/.stackhpc/inventory/group_vars/grafana/overrides.yml b/environments/.stackhpc/inventory/group_vars/all/grafana.yml similarity index 100% rename from environments/.stackhpc/inventory/group_vars/grafana/overrides.yml rename to environments/.stackhpc/inventory/group_vars/all/grafana.yml diff --git a/environments/.stackhpc/inventory/group_vars/all/manila.yml b/environments/.stackhpc/inventory/group_vars/all/manila.yml new file mode 100644 index 000000000..59f935873 --- /dev/null +++ b/environments/.stackhpc/inventory/group_vars/all/manila.yml @@ -0,0 +1,7 @@ +os_manila_mount_shares_arcus: + - share_name: slurm-v2-home + mount_path: /project + - share_name: slurm-scratch + mount_path: /scratch + +os_manila_mount_shares: "{{ os_manila_mount_shares_arcus if ci_cloud == 'ARCUS' else [] }}" diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml similarity index 100% rename from environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml rename to environments/.stackhpc/inventory/group_vars/all/openhpc.yml diff --git a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml index 11d475664..72b6cf476 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml @@ -1 +1,8 @@ -openondemand_servername: "{{ hostvars[ groups['openondemand'] | first].ansible_host }}" # Use a SOCKS proxy to acccess +openondemand_auth: basic_pam +openondemand_jupyter_partition: standard +openondemand_desktop_partition: standard +#openondemand_dashboard_support_url: +#openondemand_dashboard_docs_url: +#openondemand_filesapp_paths: +ondemand_package: ondemand-"{{ ondemand_package_version }}" +ondemand_package_version: '3.1.10' diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 8d7ee98d2..10b15adac 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1 +1,21 @@ #update_enable: false # Can uncomment for speed debugging non-update related build issues +sssd_install_ldap: true # include sssd-ldap package in fatimage +# update_enable: false # Can uncomment for speed debugging non-update related build issues + +# Uncomment below to use CI pulp servers + +# pulp_server_config: +# LEAFCLOUD: +# url: http://192.168.10.157:8080 +# password: lookup('env','LEAFCLOUD_PULP_PASSWORD') + +# appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" +# pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" + +# Alternatively, configure to use ark directly: +dnf_repos_username: slurm-app-ci +dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}" + +# Can be set regardless of approach above: +pulp_site_upstream_username: slurm-app-ci +pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" diff --git a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml deleted file mode 100644 index 72b6cf476..000000000 --- a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml +++ /dev/null @@ -1,8 +0,0 @@ -openondemand_auth: basic_pam -openondemand_jupyter_partition: 
standard -openondemand_desktop_partition: standard -#openondemand_dashboard_support_url: -#openondemand_dashboard_docs_url: -#openondemand_filesapp_paths: -ondemand_package: ondemand-"{{ ondemand_package_version }}" -ondemand_package_version: '3.1.10' diff --git a/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml b/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml deleted file mode 100644 index c3b28b913..000000000 --- a/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml +++ /dev/null @@ -1 +0,0 @@ -selinux_state: disabled diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index f9a2087c8..37bd8c3d6 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241118-0918-4538c6df", - "RL9": "openhpc-RL9-241118-0918-4538c6df" + "RL8": "openhpc-RL8-250114-1627-bccc88b5", + "RL9": "openhpc-RL9-250114-1626-bccc88b5" } } diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index d14896dd1..b76fca322 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -88,6 +88,7 @@ module "cluster" { standard: { # NB: can't call this default! nodes: ["compute-0", "compute-1"] flavor: var.other_node_flavor + compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"] } # Example of how to add another partition: diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 15340820f..23aafd73e 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -6,6 +6,8 @@ appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }} appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform +appliances_mode: configure +appliances_pulp_url: https://ark.stackhpc.com # Address(ip/dns) for internal communication between services. This is # normally traffic you do no want to expose to users. 
@@ -78,4 +80,92 @@ appliances_local_users_default: appliances_local_users_extra: [] # see format of appliances_local_users_default above appliances_local_users: "{{ appliances_local_users_default + appliances_local_users_extra }}" -########################################################################################### +################## bootstrap: extra package installs ###################################### + +appliances_extra_packages_default: + - htop + - nano + - screen + - tmux + - wget + - bind-utils + - net-tools + - postfix + - git + - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" + +appliances_extra_packages_other: [] + +appliances_extra_packages: "{{ appliances_extra_packages_default + appliances_extra_packages_other }}" + +###################### ark repo timestamps ################################################### + +appliances_pulp_repos: + baseos: + '9.4': + timestamp: 20241115T011711 + path: rocky/9.4/BaseOS/x86_64/os + '9.5': + timestamp: 20241216T013503 + path: rocky/9.5/BaseOS/x86_64/os + '8.10': + timestamp: 20241217T123729 + path: rocky/8.10/BaseOS/x86_64/os + appstream: + '9.4': + timestamp: 20241112T003151 + path: rocky/9.4/AppStream/x86_64/os + '9.5': + timestamp: 20241217T005008 + path: rocky/9.5/AppStream/x86_64/os + '8.10': + timestamp: 20241217T123729 + path: rocky/8.10/AppStream/x86_64/os + crb: + '9.4': + timestamp: 20241115T003133 + path: rocky/9.4/CRB/x86_64/os + '9.5': + timestamp: 20241217T005008 + path: rocky/9.5/CRB/x86_64/os + '8.10': + timestamp: 20241217T123729 + path: rocky/8.10/PowerTools/x86_64/os + extras: + '9.4': + timestamp: 20241118T002802 + path: rocky/9.4/extras/x86_64/os + '9.5': + timestamp: 20241218T004632 + path: rocky/9.5/extras/x86_64/os + '8.10': + timestamp: 20241217T123729 + path: rocky/8.10/extras/x86_64/os + epel: + '9': + timestamp: 20241213T010218 + path: epel/9/Everything/x86_64 + '8': + timestamp: 20241216T235733 + path: epel/8/Everything/x86_64 + openhpc_base: + '8': + path: OpenHPC/2/EL_8 + timestamp: 20241218T154614 + '9': + path: OpenHPC/3/EL_9 + timestamp: 20241218T154614 + openhpc_updates: + '8': + path: OpenHPC/2/updates/EL_8 + timestamp: 20241218T154614 + '9': + path: OpenHPC/3/updates/EL_9 + timestamp: 20241218T154614 + ceph: + '8': + timestamp: 20231104T015751 + path: centos/8-stream/storage/x86_64/ceph-quincy + '9': + timestamp: 20240923T233036 + path: centos/9-stream/storage/x86_64/ceph-reef diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index bd340b190..45b7c6967 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -3,7 +3,7 @@ # See: https://github.com/stackhpc/ansible-role-cluster-nfs # for variable definitions -nfs_server_default: "{{ hostvars[groups['control'] | first ].internal_address }}" +nfs_server_default: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init nfs_configurations: - comment: Export /exports/home from Slurm control node as /home @@ -15,3 +15,9 @@ nfs_configurations: nfs_server: "{{ nfs_server_default }}" nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" + + - comment: Export /exports/cluster from Slurm control node + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: false + nfs_export: "/exports/cluster" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml 
b/environments/common/inventory/group_vars/all/openhpc.yml index c613fc697..3b3879de9 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -13,7 +13,7 @@ openhpc_slurm_accounting_storage_type: 'accounting_storage/slurmdbd' openhpc_slurmdbd_mysql_database: slurm_acct_db openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}" openhpc_slurmdbd_mysql_username: slurm -openhpc_slurm_control_host: "{{ hostvars[groups['control'].0].api_address }}" +openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" openhpc_slurm_partitions: - name: "compute" @@ -38,3 +38,16 @@ openhpc_config_default: openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}" + +openhpc_install_type: ohpc # 'ohpc' or 'generic', see https://github.com/stackhpc/ansible-slurm-appliance/pull/326 + +# Empty repo lists from stackhpc.openhpc role defaults, as these repofiles are +# now generated by dnf_repos to allow injecting Ark creds: +ohpc_openhpc_repos: + "9": [] + "8": [] + +# overriding to ensure doesn't overwrite Ark epel repo +ohpc_default_extra_repos: + "9": [] + "8": [] diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 5e85392ca..cce923fcc 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -5,7 +5,12 @@ # NB: Variables prefixed ood_ are all from https://github.com/OSC/ood-ansible -# openondemand_servername: '' # Must be defined when using openondemand +openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if groups['openondemand'] else '' }}" + +openondemand_auth: basic_pam + +openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" +openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" # Regex defining hosts which openondemand can proxy; the default regex is compute nodes (for apps) and grafana host, # e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'. 
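As a sketch of how these defaults interact (partition names below are illustrative, not appliance defaults), a site overriding `openhpc_slurm_partitions` will also change the default Open OnDemand desktop and Jupyter partitions, which follow the first entry unless set explicitly:

```yaml
# environments/site/inventory/group_vars/all/openhpc.yml (illustrative sketch):
openhpc_slurm_partitions:
  - name: general    # first entry becomes the default openondemand_desktop_partition / openondemand_jupyter_partition
  - name: gpu
```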
diff --git a/environments/common/inventory/group_vars/all/os-manila-mount.yml b/environments/common/inventory/group_vars/all/os-manila-mount.yml new file mode 100644 index 000000000..6b25d62cb --- /dev/null +++ b/environments/common/inventory/group_vars/all/os-manila-mount.yml @@ -0,0 +1,3 @@ +# Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are +# now generated by dnf_repos to allow injecting Ark creds: +os_manila_mount_ceph_rpm_repos: [] diff --git a/environments/common/inventory/group_vars/all/pulp.yml b/environments/common/inventory/group_vars/all/pulp.yml new file mode 100644 index 000000000..22bb83216 --- /dev/null +++ b/environments/common/inventory/group_vars/all/pulp.yml @@ -0,0 +1,11 @@ +pulp_site_port: 8080 + +# If using Ark directly (no local Pulp server), override the following with Ark creds + +# dnf_repos_username: +# dnf_repos_password: + +# If instead using local Pulp server, override below with Ark creds + +# pulp_site_upstream_username: +# pulp_site_upstream_password: diff --git a/environments/common/inventory/group_vars/all/selinux.yml b/environments/common/inventory/group_vars/all/selinux.yml index 25fbbd68f..fef5c3f58 100644 --- a/environments/common/inventory/group_vars/all/selinux.yml +++ b/environments/common/inventory/group_vars/all/selinux.yml @@ -1,4 +1,4 @@ --- -selinux_state: permissive +selinux_state: disabled selinux_policy: targeted diff --git a/environments/common/inventory/group_vars/all/sshd.yaml b/environments/common/inventory/group_vars/all/sshd.yaml new file mode 100644 index 000000000..5d4ed228f --- /dev/null +++ b/environments/common/inventory/group_vars/all/sshd.yaml @@ -0,0 +1 @@ +sshd_password_authentication: "{{ sssd_install_ldap | default(false) | bool }}" diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml index 22042c1bf..dae4edd9a 100644 --- a/environments/common/inventory/group_vars/builder/defaults.yml +++ b/environments/common/inventory/group_vars/builder/defaults.yml @@ -22,3 +22,6 @@ squid_cache_disk: 0 # just needs to be defined squid_cache_mem: 0 tuned_started: false tuned_enabled: false +sssd_started: false +sssd_enabled: false +appliances_mode: build diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 9b9aa5bf0..cb49b92e2 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -13,9 +13,6 @@ login control compute -[eessi:children] -# Hosts on which EESSI stack should be configured - [hpctests:children] # Login group to use for running mpi-based testing. login @@ -79,9 +76,6 @@ cluster # Hosts to install firewalld on - see ansible/roles/filewalld fail2ban -[block_devices] -# Superset of hosts to configure filesystems on - see ansible/roles/block_devices/README.md - [basic_users] # Add `openhpc` group to add slurm users via creation of users on each node. @@ -118,12 +112,18 @@ freeipa_client [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md +[eessi] +# Hosts on which EESSI stack should be configured + [resolv_conf] # Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md [proxy] # Hosts to configure http/s proxies - see ansible/roles/proxy/README.md +[manila] +# Hosts to configure for manila fileshares + [persist_hostkeys] # Hosts to persist hostkeys for across reimaging. NB: Requires appliances_state_dir on hosts. 
@@ -136,6 +136,15 @@ freeipa_client [ansible_init] # Hosts to run linux-anisble-init +[sssd] +# Hosts to configure sssd on + +[sshd] +# Hosts where the OpenSSH server daemon should be configured + +[compute_init] +# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on + [k3s] # Hosts to run k3s server/agent @@ -144,3 +153,15 @@ freeipa_client [lustre] # Hosts to run lustre client + +[extra_packages] +# Hosts to install specified additional packages on + +[dnf_repos:children] +# Hosts to replace system repos with Pulp repos +# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users +builder +extra_packages + +[pulp] +# Add builder to this group to enable automatically syncing of pulp during image build diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index ba5cbc08d..8b5046bfc 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -36,8 +36,9 @@ login [block_devices:children] # Environment-specific so not defined here -[basic_users] +[basic_users:children] # Add `openhpc` group to add Slurm users via creation of users on each node. +openhpc [openondemand:children] # Host to run Open Ondemand server on - subset of login @@ -51,13 +52,15 @@ compute # Subset of compute to run a Jupyter Notebook servers on via Open Ondemand compute -[etc_hosts] +[etc_hosts:children] # Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md +cluster [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md [eessi:children] +# Hosts on which EESSI stack should be configured openhpc [resolv_conf] @@ -69,8 +72,10 @@ openhpc [manila] # Hosts to configure for manila fileshares -[persist_hostkeys] -# Hosts to persist hostkeys for across reimaging. NB: Requires appliances_state_dir on hosts. +[persist_hostkeys:children] +# Hosts to use common set of hostkeys which persist across reimaging. 
+login +openondemand [squid] # Hosts to run squid proxy @@ -79,9 +84,19 @@ openhpc # Hosts to run TuneD configuration [ansible_init:children] -# Hosts to run ansible-init +# Hosts to run linux-anisble-init cluster +[sssd] +# Hosts to configure sssd on + +[sshd] +# Hosts where the OpenSSH server daemon should be configured + +[compute_init:children] +# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on +compute + [k3s:children] # Hosts to run k3s server/agent openhpc @@ -92,3 +107,7 @@ control [lustre] # Hosts to run lustre client + +[extra_packages:children] +# Hosts to install specified additional packages on +builder diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml new file mode 100644 index 000000000..dc993c3b8 --- /dev/null +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml @@ -0,0 +1,4 @@ +basic_users_users: + - name: demo_user + password: "{% raw %}{{ vault_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}{% endraw %}" # idempotent + uid: 1005 diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml new file mode 100644 index 000000000..521616a1b --- /dev/null +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml @@ -0,0 +1 @@ +grafana_auth_anonymous: true \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index abc3494f0..c8a2b22f4 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -3,22 +3,29 @@ module "compute" { for_each = var.compute + # must be set for group: nodes = each.value.nodes + flavor = each.value.flavor + cluster_name = var.cluster_name cluster_domain_suffix = var.cluster_domain_suffix cluster_net_id = data.openstack_networking_network_v2.cluster_net.id cluster_subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id - flavor = each.value.flavor + # can be set for group, defaults to top-level value: image_id = lookup(each.value, "image_id", var.cluster_image_id) vnic_type = lookup(each.value, "vnic_type", var.vnic_type) vnic_profile = lookup(each.value, "vnic_profile", var.vnic_profile) + volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + extra_volumes = lookup(each.value, "extra_volumes", {}) + + compute_init_enable = lookup(each.value, "compute_init_enable", []) + key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] baremetal_nodes = data.external.baremetal_nodes.result - volume_backed_instances = var.volume_backed_instances - root_volume_size = var.root_volume_size } diff --git 
a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 6ad510840..bc96c926f 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -1,3 +1,33 @@ +locals { + all_compute_volumes = {for v in setproduct(var.nodes, keys(var.extra_volumes)): "${v[0]}-${v[1]}" => {"node" = v[0], "volume" = v[1]}} + # e.g. with + # var.nodes = ["compute-0", "compute-1"] + # var.extra_volumes = { + # "vol-a" = {size = 10}, + # "vol-b" = {size = 20} + # } + # this is a mapping with + # keys "compute-0-vol-a", "compute-0-vol-b" ... + # values which are a mapping e.g. {"node"="compute-0", "volume"="vol-a"} +} + +resource "openstack_blockstorage_volume_v3" "compute" { + + for_each = local.all_compute_volumes + + name = "${var.cluster_name}-${each.key}" + description = "Compute node ${each.value.node} volume ${each.value.volume}" + size = var.extra_volumes[each.value.volume].size +} + +resource "openstack_compute_volume_attach_v2" "compute" { + + for_each = local.all_compute_volumes + + instance_id = openstack_compute_instance_v2.compute["${each.value.node}"].id + volume_id = openstack_blockstorage_volume_v3.compute["${each.key}"].id +} + resource "openstack_networking_port_v2" "compute" { for_each = toset(var.nodes) @@ -44,11 +74,14 @@ resource "openstack_compute_instance_v2" "compute" { access_network = true } - metadata = { - environment_root = var.environment_root - k3s_token = var.k3s_token - k3s_server = var.k3s_server - } + metadata = merge( + { + environment_root = var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + }, + {for e in var.compute_init_enable: e => true} + ) availability_zone = var.match_ironic_node ? "${var.availability_zone}::${var.baremetal_nodes[each.key]}" : null diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 98992322d..571ced0e5 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -60,6 +60,18 @@ variable "root_volume_size" { type = number } +variable "extra_volumes" { + description = <<-EOF + Mapping defining additional volumes to create and attach. + Keys are unique volume name. 
+ Values are a mapping with: + size: Size of volume in GB + **NB**: The order in /dev is not guaranteed to match the mapping + EOF + type = any + default = {} +} + variable "security_group_ids" { type = list } @@ -68,10 +80,6 @@ variable "k3s_token" { type = string } -variable "k3s_server" { - type = string -} - variable "match_ironic_node" { description = "Whether to launch instances on the Ironic node of the same name as this cluster node" type = bool @@ -88,3 +96,14 @@ variable availability_zone { variable "baremetal_nodes" { type = map(string) } + +variable "control_address" { + description = "Name/address of control node" + type = string +} + +variable "compute_init_enable" { + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] +} diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index 5b12f4354..e091ae003 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -145,7 +145,7 @@ resource "openstack_compute_instance_v2" "login" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] } availability_zone = each.value.match_ironic_node ? "${each.value.availability_zone}::${data.external.baremetal_nodes.result[each.key]}" : null diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 5fa262128..0279af019 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -6,7 +6,7 @@ variable "cluster_name" { variable "cluster_domain_suffix" { type = string description = "Domain suffix for cluster" - default = "invalid" + default = "internal" } variable "cluster_net" { @@ -59,6 +59,14 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile + compute_init_enable: Toggles compute-init rebuild (see compute-init role docs) + volume_backed_instances: Overrides variable volume_backed_instances + root_volume_size: Overrides variable root_volume_size + extra_volumes: Mapping defining additional volumes to create and attach + Keys are unique volume name. + Values are a mapping with: + size: Size of volume in GB + **NB**: The order in /dev is not guaranteed to match the mapping EOF } @@ -142,4 +150,4 @@ variable "root_volume_size" { variable "k3s_token" { description = "K3s cluster authentication token, set automatically by Ansible" type = string -} +} \ No newline at end of file diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 52202ead1..2ba0a1e63 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -23,6 +23,7 @@ data "git-commit" "cwd-head" { } locals { git_commit = data.git-commit.cwd-head.hash timestamp = formatdate("YYMMDD-hhmm", timestamp()) + image_name_version = var.image_name_version == "auto" ? 
"-${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_name_version } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -39,12 +40,6 @@ variable "networks" { type = list(string) } -variable "os_version" { - type = string - description = "'RL8' or 'RL9' with default source_image_* mappings" - default = "RL9" -} - # Must supply either source_image_name or source_image_id variable "source_image_name" { type = string @@ -123,15 +118,6 @@ variable "volume_type" { } variable "volume_size" { - type = map(number) - default = { - # fat image builds, GB: - rocky-latest = 15 - openhpc = 15 - } -} - -variable "extra_build_volume_size" { type = number default = 15 } @@ -146,25 +132,22 @@ variable "metadata" { default = {} } -variable "groups" { - type = map(list(string)) - description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" - default = { - # fat image builds: - rocky-latest = ["update"] - openhpc = ["control", "compute", "login"] - } +variable "inventory_groups" { + type = string + description = "Comma-separated list of additional inventory groups (other than 'builder') to add build VM to. Default is none." + default = "" } -variable "extra_build_groups" { - type = list(string) - default = [] +variable "image_name" { + type = string + description = "Name of image" + default = "openhpc" } -variable "extra_build_image_name" { +variable "image_name_version" { type = string - description = "Infix for 'extra' build image name" - default = "extra" + description = "Suffix for image name giving version. Default of 'auto' appends timestamp + short commit" + default = "auto" } source "openstack" "openhpc" { @@ -172,9 +155,11 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type - volume_size = lookup(var.volume_size, source.name, var.extra_build_volume_size) + volume_size = var.volume_size metadata = var.metadata - instance_metadata = {ansible_init_disable = "true"} + instance_metadata = { + ansible_init_disable = "true" + } networks = var.networks floating_ip_network = var.floating_ip_network security_groups = var.security_groups @@ -200,27 +185,13 @@ source "openstack" "openhpc" { build { - # latest nightly image: - source "source.openstack.openhpc" { - name = "rocky-latest" - image_name = "${source.name}-${var.os_version}" - } - - # fat image: - source "source.openstack.openhpc" { - name = "openhpc" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" - } - - # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { - name = "openhpc-extra" - image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "${var.image_name}${local.image_name_version}" } provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], lookup(var.groups, source.name, var.extra_build_groups)) + groups = concat(["builder"], var.inventory_groups == "" ? 
[] : split(",", var.inventory_groups)) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [ diff --git a/requirements.txt b/requirements.txt index 6651506fb..872ee9516 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ansible==6.0.0 +ansible==6.7.0 # cloudalchemy.prometheus uses ansible.builtin.include, removed in ansible-core==2.16 => ansible==9 openstacksdk python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild python-manilaclient @@ -9,3 +9,4 @@ cookiecutter selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 netaddr matplotlib +pulp-cli==0.23.2 diff --git a/requirements.yml b/requirements.yml index 3d8c44011..71adbc6e5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.26.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/168 + version: v0.27.0 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc @@ -21,11 +21,11 @@ roles: version: v3.1.5 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: v24.11.0 # Support ceph quincy for RL9 + version: v25.1.1 collections: - name: containers.podman - version: 1.10.2 + version: 1.16.2 - name: community.grafana version: 1.5.4 - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools @@ -49,4 +49,10 @@ collections: - name: https://github.com/azimuth-cloud/ansible-collection-image-utils type: git version: 0.4.0 + # stackhpc.pulp has pulp.squeezer as dependency, any version, but latest + # requires newer ansible than can install + - name: pulp.squeezer + version: 0.0.15 + - name: stackhpc.pulp + version: 0.5.5 ...