From 054cb73a3eb16d2f5aaa09dd07a08ff40d421395 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 29 Oct 2024 15:49:42 +0000 Subject: [PATCH 001/182] copy /etc/hosts to /exports/hosts/hosts --- ansible/roles/etc_hosts/tasks/main.yml | 23 +++++++++++++++++++ .../common/inventory/group_vars/all/nfs.yml | 6 +++++ 2 files changed, 29 insertions(+) diff --git a/ansible/roles/etc_hosts/tasks/main.yml b/ansible/roles/etc_hosts/tasks/main.yml index 6fdabf57c..1d04ebf7c 100644 --- a/ansible/roles/etc_hosts/tasks/main.yml +++ b/ansible/roles/etc_hosts/tasks/main.yml @@ -6,3 +6,26 @@ group: root mode: 0644 become: yes + +- name: Ensure /exports/hosts directory exists and copy /etc/hosts + block: + - name: Ensure the /exports/hosts directory exists + file: + path: /exports/hosts + state: directory + owner: root + group: root + mode: 0755 + become: yes + delegate_to: "{{ groups['control'] | first }}" + + - name: Copy /etc/hosts to NFS exported directory + copy: + src: /etc/hosts + dest: /exports/hosts/hosts + owner: root + group: root + mode: 0644 + remote_src: true + become: yes + delegate_to: "{{ groups['control'] | first }}" \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index bd340b190..110a1383c 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -15,3 +15,9 @@ nfs_configurations: nfs_server: "{{ nfs_server_default }}" nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" + + - comment: Export /etc/hosts copy from Slurm control node + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: false + nfs_export: "/exports/hosts" # control node has to copy in /etc/hosts to here \ No newline at end of file From 36de79680823f62779595f7cb086a9375f1811c7 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 31 Oct 2024 11:51:13 +0000 Subject: [PATCH 002/182] add resolv_conf role to compute script --- ansible/.gitignore | 2 + ansible/extras.yml | 8 +++ .../roles/compute_init/files/compute-init.yml | 59 +++++++++++++++++++ ansible/roles/compute_init/tasks/main.yml | 45 ++++++++++++++ environments/common/inventory/groups | 5 +- environments/common/layouts/everything | 6 +- 6 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 ansible/roles/compute_init/files/compute-init.yml create mode 100644 ansible/roles/compute_init/tasks/main.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index 2ceeb596b..677b4c31f 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -58,4 +58,6 @@ roles/* !roles/squid/** !roles/tuned/ !roles/tuned/** +!roles/compute_init/ +!roles/compute_init/** diff --git a/ansible/extras.yml b/ansible/extras.yml index c32f51c32..18bad1dfd 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -36,3 +36,11 @@ tasks: - import_role: name: persist_hostkeys + +- name: Inject ansible-init compute script + hosts: compute_init + tags: compute_init + become: yes + tasks: + - import_role: + name: compute_init \ No newline at end of file diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml new file mode 100644 index 000000000..ce797c1cf --- /dev/null +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -0,0 +1,59 @@ +--- + +- name: Compute node initialisation + hosts: localhost + become: yes + vars: + control_node_ip: "172.16.1.228" + nfs_export: "/exports/hosts" + 
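+    # NB: control_node_ip is hardcoded here for initial testing; a later commit
+    # marks these vars as "TO BE SUPPLIED VIA CLOUD INIT METADATA"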
resolv_conf_nameservers: [] + + tasks: + - name: Configure resolve.conf + block: + - name: Set nameservers in /etc/resolv.conf + ansible.builtin.template: + src: /etc/ansible-init/templates/resolv.conf.j2 + dest: /etc/resolv.conf + owner: root + group: root + mode: u=rw,og=r + + - name: Disable NetworkManager control of resolv.conf + ansible.builtin.copy: + src: /etc/ansible-init/files/NetworkManager-dns-none.conf + dest: /etc/NetworkManager/conf.d/90-dns-none.conf + owner: root + group: root + mode: u=rw,og=r + register: _copy_nm_config + + - name: Reload NetworkManager + ansible.builtin.systemd: + name: NetworkManager + state: reloaded + when: _copy_nm_config.changed | default(false) + + - name: Mount /etc/hosts on compute nodes + block: + - name: Ensure the mount directory exists + file: + path: /mnt/hosts + state: directory + mode: 0755 + + - name: Mount NFS export + mount: + path: /mnt/hosts + src: "{{ vars.control_node_ip }}:{{ nfs_export }}" + fstype: nfs + opts: rw,sync + state: mounted + + - name: Copy /exports/hosts contents to /etc/hosts + copy: + src: /mnt/hosts/hosts + dest: /etc/hosts + owner: root + group: root + mode: 0644 diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml new file mode 100644 index 000000000..49b7d37e8 --- /dev/null +++ b/ansible/roles/compute_init/tasks/main.yml @@ -0,0 +1,45 @@ +--- + +- name: Ensure templates directory exists + file: + path: /etc/ansible-init/templates + state: directory + owner: root + group: root + mode: 0644 + +- name: Inject templates + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/templates/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../resolv_conf/templates/resolv.conf.j2 + +- name: Ensure files directory exists + file: + path: /etc/ansible-init/files + state: directory + owner: root + group: root + mode: 0644 + +- name: Inject files + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/files/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../resolv_conf/files/NetworkManager-dns-none.conf + +- name: Inject compute initialisation playbook + copy: + src: compute-init.yml + dest: /etc/ansible-init/playbooks/compute-init.yml + owner: root + group: root + mode: 0644 \ No newline at end of file diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ea0bebebc..62a1fb0d2 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -134,4 +134,7 @@ freeipa_client # Hosts to run TuneD configuration [ansible_init] -# Hosts to run linux-anisble-init \ No newline at end of file +# Hosts to run linux-anisble-init + +[compute_init] +# Hosts to deploy compute initialisation ansible-init script to. \ No newline at end of file diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 205f1d334..19880ddef 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -80,4 +80,8 @@ openhpc [ansible_init:children] # Hosts to run ansible-init -cluster \ No newline at end of file +cluster + +[compute_init:children] +# Hosts to deploy compute initialisation ansible-init script to. 
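+# (the "everything" layout opts the whole compute group in; the bare
+# [compute_init] group in inventory/groups stays empty for selective opt-in)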
+compute \ No newline at end of file From c1065b3dcd4ed8a460f078e1ab8a23fe7e7afabb Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 6 Nov 2024 11:47:14 +0000 Subject: [PATCH 003/182] add manila to compute script --- .../roles/compute_init/files/compute-init.yml | 171 +++++++++++++++++- ansible/roles/compute_init/tasks/main.yml | 48 +++++ .../common/inventory/group_vars/all/nfs.yml | 11 +- 3 files changed, 223 insertions(+), 7 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index ce797c1cf..179b56fbf 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,8 +5,55 @@ become: yes vars: control_node_ip: "172.16.1.228" - nfs_export: "/exports/hosts" - resolv_conf_nameservers: [] + nfs_export_hosts: "/exports/hosts" + resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] + + # block device (disk) on which to create the exported filesystem. + # if the disk is not defined, formatting and mounting will not be done. + nfs_disk_location: + + # Path to exported filesystem mountpoint on nfs servers + nfs_export: "/exports/home" + + # nfs client mount options + nfs_client_mnt_options: + + # Path to mountpoint on nfs clients + nfs_client_mnt_point: "/home" + nfs_client_mnt_state: mounted + + nfs_server: "{{ control_node_ip }}" + + + os_manila_mount_shares: [] + os_manila_mount_state: mounted + os_manila_mount_opts: + - x-systemd.device-timeout=30 + - x-systemd.mount-timeout=30 + - noatime + - _netdev # prevents mount blocking early boot before networking available + - rw + os_manila_mount_share_info: [] # populated by lookup mode + os_manila_mount_ceph_conf_path: /etc/ceph + + + basic_users_manage_homedir: false + + basic_users_userdefaults: + state: present + create_home: "{{ basic_users_manage_homedir }}" + generate_ssh_key: "{{ basic_users_manage_homedir }}" + ssh_key_comment: "{{ item.name }}" + + test_user_password: "zXpcWyGQL7jtZnqylQra4g==" + + basic_users_users: + - name: testuser # can't use rocky as $HOME isn't shared! 
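+        # (the salt below is seeded from inventory_hostname, so the generated
+        # hash - and hence the user module - is stable across replays)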
+ password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent + uid: 1005 + state: present + + basic_users_groups: [] tasks: - name: Configure resolve.conf @@ -34,6 +81,7 @@ state: reloaded when: _copy_nm_config.changed | default(false) + - name: Mount /etc/hosts on compute nodes block: - name: Ensure the mount directory exists @@ -42,18 +90,131 @@ state: directory mode: 0755 - - name: Mount NFS export + - name: Mount /mnt/hosts mount: path: /mnt/hosts - src: "{{ vars.control_node_ip }}:{{ nfs_export }}" + src: "{{ vars.control_node_ip }}:{{ nfs_export_hosts }}" fstype: nfs opts: rw,sync state: mounted - - name: Copy /exports/hosts contents to /etc/hosts + - name: Copy /mnt/hosts/hosts contents to /etc/hosts copy: src: /mnt/hosts/hosts dest: /etc/hosts owner: root group: root mode: 0644 + + + - name: NFS client mount + block: + - name: ensure mount directory exists + file: + path: "{{ nfs_client_mnt_point }}" + state: directory + + - name: mount the filesystem + mount: + path: "{{ nfs_client_mnt_point }}" + src: "{{ nfs_server }}:{{ nfs_export }}" + fstype: nfs + state: "{{ nfs_client_mnt_state }}" + + + - name: Manila mount + block: + - name: Read manila share from nfs file + slurp: + src: "/mnt/cluster/manila_share_info.yml" + register: manila_share_info_file + + - name: Parse and set fact for manila share info + set_fact: + os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}" + + - name: Ensure Ceph configuration directory exists + ansible.builtin.file: + path: "{{ os_manila_mount_ceph_conf_path }}" + state: directory + mode: "0755" + owner: root + group: root + + - name: Configure ceph.conf using os_manila_mount_host + ansible.builtin.template: + src: /etc/ansible-init/templates/ceph.conf.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf" + owner: root + group: root + mode: "0600" + + - name: Ensure mount directory exists + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Write Ceph client keyring + ansible.builtin.template: + src: /etc/ansible-init/templates/ceph.keyring.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring" + mode: "0600" + owner: root + group: root + loop: "{{ os_manila_mount_share_info }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Mount the Ceph share + ansible.posix.mount: + path: "{{ item[0].mount_path }}" + src: "{{ item[1].host }}:{{ item[1].export }}" + fstype: ceph + opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}" + # NB share_user is looked up here in case of autodetection + state: "{{ item[0].mount_state | default(os_manila_mount_state) }}" + loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}" + loop_control: + label: "{{ item[0].share_name }}" + + - name: Ensure mounted directory has correct permissions + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + when: item.mount_state | 
default(os_manila_mount_state) in ['mounted' or 'ephemeral'] + + + - name: Basic users setup + block: + - name: Create groups + ansible.builtin.group: "{{ item }}" + loop: "{{ basic_users_groups }}" + + - name: Create users + user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}" + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }} [{{ item.state | default('present') }}]" + register: basic_users_info + + - name: Write sudo rules + blockinfile: + path: /etc/sudoers.d/80-{{ item.name}}-user + block: "{{ item.sudo }}" + create: true + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }}" + when: "'sudo' in item" \ No newline at end of file diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 49b7d37e8..812ed84ff 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -17,6 +17,8 @@ mode: 0644 loop: - ../../resolv_conf/templates/resolv.conf.j2 + - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 + - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - name: Ensure files directory exists file: @@ -36,6 +38,52 @@ loop: - ../../resolv_conf/files/NetworkManager-dns-none.conf +- name: Ensure library directory exists + file: + path: /etc/ansible-init/library + state: directory + owner: root + group: root + mode: 0644 + +- name: Inject files + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/library/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../basic_users/library/terminate_user_sessions.py + - ../../stackhpc.os-manila-mount/library/os_manila_share.py + +- name: Ensure filter_plugins directory exists + file: + path: /etc/ansible-init/filter_plugins + state: directory + owner: root + group: root + mode: 0644 + +- name: Inject filter_plugins + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../basic_users/filter_plugins/filter_keys.py + +- name: Add filter_plugins ansible.cfg + lineinfile: + path: /etc/ansible-init/ansible.cfg + line: "filter_plugins = /etc/ansible-init/filter_plugins" + state: present + owner: root + group: root + mode: 0644 + - name: Inject compute initialisation playbook copy: src: compute-init.yml diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 110a1383c..036850847 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -18,6 +18,13 @@ nfs_configurations: - comment: Export /etc/hosts copy from Slurm control node nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" + server: "{{ inventory_hostname in groups['control'] }}" clients: false - nfs_export: "/exports/hosts" # control node has to copy in /etc/hosts to here \ No newline at end of file + nfs_export: "/exports/hosts" # control node has to copy in /etc/hosts to here + + - comment: Export cluster info from control node + nfs_enable: + server: "{{ inventory_hostname in groups['control']}}" + clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" + nfs_server: "{{ nfs_server_default }}" + nfs_export: "/exports/cluster" \ No newline at end of file From fce13ede9a62884be2def320fc37aee07ea559d5 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 6 Nov 2024 15:02:55 +0000 Subject: [PATCH 004/182] Compute 
script: configure EESSI --- .../roles/compute_init/files/compute-init.yml | 70 ++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 179b56fbf..23f865f52 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -11,17 +11,13 @@ # block device (disk) on which to create the exported filesystem. # if the disk is not defined, formatting and mounting will not be done. nfs_disk_location: - # Path to exported filesystem mountpoint on nfs servers nfs_export: "/exports/home" - # nfs client mount options nfs_client_mnt_options: - # Path to mountpoint on nfs clients nfs_client_mnt_point: "/home" nfs_client_mnt_state: mounted - nfs_server: "{{ control_node_ip }}" @@ -38,23 +34,29 @@ basic_users_manage_homedir: false - basic_users_userdefaults: state: present create_home: "{{ basic_users_manage_homedir }}" generate_ssh_key: "{{ basic_users_manage_homedir }}" ssh_key_comment: "{{ item.name }}" - test_user_password: "zXpcWyGQL7jtZnqylQra4g==" - basic_users_users: - name: testuser # can't use rocky as $HOME isn't shared! password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent uid: 1005 state: present - basic_users_groups: [] + + # Default to 10GB + cvmfs_quota_limit_mb: 10000 + cvmfs_config_default: + CVMFS_CLIENT_PROFILE: single + CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}" + cvmfs_config_overrides: {} + cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" + + tasks: - name: Configure resolve.conf block: @@ -217,4 +219,54 @@ loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" - when: "'sudo' in item" \ No newline at end of file + when: "'sudo' in item" + + + - name: Configure EESSI + gather_facts: false + block: + - name: Download Cern GPG key + ansible.builtin.get_url: + url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM + dest: ./cvmfs-key.gpg + + - name: Import downloaded GPG key + command: rpm --import cvmfs-key.gpg + + - name: Add CVMFS repo + dnf: + name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm + + - name: Install CVMFS + dnf: + name: cvmfs + + - name: Install EESSI CVMFS config + dnf: + name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm + # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? 
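+        # (the RPM is at least fetched over HTTPS from the project's own GitHub
+        # releases, giving some transport integrity despite the missing signature)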
+ disable_gpg_check: true + + # Alternative version using official repo - still no GPG key :( + # - name: Add EESSI repo + # dnf: + # name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm + + # - name: Install EESSI CVMFS config + # dnf: + # name: cvmfs-config-eessi + + - name: Add base CVMFS config + community.general.ini_file: + dest: /etc/cvmfs/default.local + section: null + option: "{{ item.key }}" + value: "{{ item.value }}" + no_extra_spaces: true + loop: "{{ cvmfs_config | dict2items }}" + + + # NOTE: Not clear how to make this idempotent + - name: Ensure CVMFS config is setup + command: + cmd: "cvmfs_config setup" \ No newline at end of file From 36f1e170f04fad5ca68a4fff6e876c3bd5b9780a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 7 Nov 2024 13:19:51 +0000 Subject: [PATCH 005/182] testing openhpc in compute script --- .../roles/compute_init/files/compute-init.yml | 221 +++++++++++++++++- ansible/roles/compute_init/tasks/main.yml | 2 + 2 files changed, 221 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 23f865f52..bdbe8ab08 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -264,9 +264,226 @@ value: "{{ item.value }}" no_extra_spaces: true loop: "{{ cvmfs_config | dict2items }}" - # NOTE: Not clear how to make this idempotent - name: Ensure CVMFS config is setup command: - cmd: "cvmfs_config setup" \ No newline at end of file + cmd: "cvmfs_config setup" + + + - name: Configure openhpc + block: + - name: Check openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions exist + assert: + that: + - openhpc_slurm_control_host is defined + - openhpc_cluster_name is defined + - openhpc_cluster_name != '' + - openhpc_slurm_partitions is defined + fail_msg: "Undefined openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions." 
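+        # NB: in configless mode a compute node needs only the munge key plus a
+        # "--conf-server" option telling slurmd where slurmctld is (written into
+        # /etc/sysconfig/slurmd further down); slurmd then fetches slurm.conf
+        # from the controller itself.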
+ + - name: Fail if control host not in play and munge key not specified + fail: + msg: "Either the slurm control node must be in the play or `openhpc_munge_key` must be set" + when: + - openhpc_slurm_control_host not in ansible_play_hosts + - not openhpc_munge_key + + # - name: Ensure Slurm directories exists + # file: + # path: "{{ openhpc_state_save_location }}" + # owner: slurm + # group: slurm + # mode: 0755 + # state: directory + # when: inventory_hostname == openhpc_slurm_control_host + + # - name: Generate a Munge key on control host + # # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler + # command: "dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024" + # args: + # creates: "/etc/munge/munge.key" + # when: inventory_hostname == openhpc_slurm_control_host + + # - name: Retrieve Munge key from control host + # slurp: + # src: "/etc/munge/munge.key" + # register: openhpc_control_munge_key + # delegate_to: "{{ openhpc_slurm_control_host }}" + # when: openhpc_slurm_control_host in ansible_play_hosts + + - name: Fix permissions on /etc to pass Munge startup checks + # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root) + # which fails munged startup checks + file: + path: /etc + state: directory + mode: g-w + + - name: Write Munge key + copy: + content: "{{ openhpc_munge_key or (openhpc_control_munge_key.content | b64decode) }}" + dest: "/etc/munge/munge.key" + owner: munge + group: munge + mode: 0400 + notify: + - Restart Munge service + + - name: Ensure JobComp logfile exists + file: + path: "{{ openhpc_slurm_job_comp_loc }}" + state: touch + owner: slurm + group: slurm + mode: 0644 + access_time: preserve + modification_time: preserve + when: openhpc_slurm_job_comp_type == 'jobcomp/filetxt' + + - name: Template slurmdbd.conf + template: + src: slurmdbd.conf.j2 + dest: /etc/slurm/slurmdbd.conf + mode: "0600" + owner: slurm + group: slurm + notify: Restart slurmdbd service + when: openhpc_enable.database | default(false) | bool + + - name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other + ansible.builtin.tempfile: + register: _slurm_conf_tmpfile + delegate_to: localhost + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + changed_when: false # so molecule doesn't fail + become: no + + - name: Template basic slurm.conf + template: + src: slurm.conf.j2 + dest: "{{ _slurm_conf_tmpfile.path }}" + lstrip_blocks: true + mode: 0644 + delegate_to: localhost + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + changed_when: false # so molecule doesn't fail + become: no + + - name: Customise slurm.conf + community.general.ini_file: + path: "{{ _slurm_conf_tmpfile.path }}" + option: "{{ item.key }}" + section: '' + value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}" + no_extra_spaces: true + create: no + mode: 0644 + loop: "{{ openhpc_config | dict2items }}" + delegate_to: localhost + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + changed_when: false # so molecule doesn't fail + become: no + + - name: Create slurm.conf + copy: + src: "{{ _slurm_conf_tmpfile.path }}" + dest: /etc/slurm/slurm.conf + owner: root + group: root + mode: 0644 + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + notify: + - Restart slurmctld service + 
register: ohpc_slurm_conf + # NB uses restart rather than reload as number of nodes might have changed + + - name: Create gres.conf + template: + src: "{{ openhpc_gres_template }}" + dest: /etc/slurm/gres.conf + mode: "0600" + owner: slurm + group: slurm + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + notify: + - Restart slurmctld service + register: ohpc_gres_conf + # NB uses restart rather than reload as this is needed in some cases + + - name: Template cgroup.conf + # appears to be required even with NO cgroup plugins: https://slurm.schedmd.com/cgroups.html#cgroup_design + template: + src: cgroup.conf.j2 + dest: /etc/slurm/cgroup.conf + mode: "0644" # perms/ownership based off src from ohpc package + owner: root + group: root + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + + - name: Remove local tempfile for slurm.conf templating + ansible.builtin.file: + path: "{{ _slurm_conf_tmpfile.path }}" + state: absent + when: _slurm_conf_tmpfile.path is defined + delegate_to: localhost + changed_when: false # so molecule doesn't fail + become: no + + - name: Notify handler for slurmd restart + debug: + msg: "notifying handlers" # meta: noop doesn't support 'when' + changed_when: true + when: + - openhpc_slurm_control_host in ansible_play_hosts + - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler + notify: + - Restart slurmd service + + - name: Set slurmctld location for configless operation + lineinfile: + path: /etc/sysconfig/slurmd + line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" + regexp: "^SLURMD_OPTIONS=" + create: yes + owner: root + group: root + mode: 0644 + when: + - openhpc_enable.batch | default(false) + - openhpc_slurm_configless + notify: + - Restart slurmd service + # Reloading is sufficent, but using a single handler means no bounce. Realistically this won't regularly change on a running slurmd so restarting is ok. + + # Munge state could be unchanged but the service is not running. + # Handle that here. 
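+      # (hence the unconditional service states below, rather than notify
+      # handlers which only fire on change)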
+ - name: Configure Munge service + service: + name: munge + enabled: "{{ openhpc_slurm_service_enabled | bool }}" + state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + + - name: Flush handler + meta: flush_handlers # as then subsequent "ensure" is a no-op if slurm services bounced + + - name: Ensure slurmdbd state + service: + name: slurmdbd + enabled: "{{ openhpc_slurm_service_enabled | bool }}" + state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + when: openhpc_enable.database | default(false) | bool + + # - name: Ensure slurmctld state + # service: + # name: slurmctld + # enabled: "{{ openhpc_slurm_service_enabled | bool }}" + # state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + # when: openhpc_enable.control | default(false) | bool + + - name: Ensure slurmd state + service: + name: slurmd + enabled: "{{ openhpc_slurm_service_enabled | bool }}" + state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + when: openhpc_enable.batch | default(false) | bool \ No newline at end of file diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 812ed84ff..d236c5e57 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -56,6 +56,7 @@ loop: - ../../basic_users/library/terminate_user_sessions.py - ../../stackhpc.os-manila-mount/library/os_manila_share.py + - ../../stackhpc.openhpc/library/sacct_cluster.py - name: Ensure filter_plugins directory exists file: @@ -74,6 +75,7 @@ mode: 0644 loop: - ../../basic_users/filter_plugins/filter_keys.py + - ../../stackhpc.openhpc/filter_plugins/slurm_conf.py - name: Add filter_plugins ansible.cfg lineinfile: From 8930d388fa9cacd206abd48563f9996daea11dd8 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 12 Nov 2024 13:51:28 +0000 Subject: [PATCH 006/182] finish transferring openhpc tasks to compute script --- .../roles/compute_init/files/compute-init.yml | 195 ++---------------- ansible/roles/compute_init/tasks/main.yml | 21 ++ 2 files changed, 35 insertions(+), 181 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index bdbe8ab08..aaee0718b 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -56,6 +56,15 @@ cvmfs_config_overrides: {} cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" + openhpc_conf_server: control_node_ip + openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2 + openhpc_slurm_service_enabled: true + openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" + openhpc_enable: + control: false + batch: true + database: false + runtime: true tasks: - name: Configure resolve.conf @@ -223,7 +232,6 @@ - name: Configure EESSI - gather_facts: false block: - name: Download Cern GPG key ansible.builtin.get_url: @@ -247,15 +255,6 @@ # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? 
disable_gpg_check: true - # Alternative version using official repo - still no GPG key :( - # - name: Add EESSI repo - # dnf: - # name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm - - # - name: Install EESSI CVMFS config - # dnf: - # name: cvmfs-config-eessi - - name: Add base CVMFS config community.general.ini_file: dest: /etc/cvmfs/default.local @@ -273,45 +272,6 @@ - name: Configure openhpc block: - - name: Check openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions exist - assert: - that: - - openhpc_slurm_control_host is defined - - openhpc_cluster_name is defined - - openhpc_cluster_name != '' - - openhpc_slurm_partitions is defined - fail_msg: "Undefined openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions." - - - name: Fail if control host not in play and munge key not specified - fail: - msg: "Either the slurm control node must be in the play or `openhpc_munge_key` must be set" - when: - - openhpc_slurm_control_host not in ansible_play_hosts - - not openhpc_munge_key - - # - name: Ensure Slurm directories exists - # file: - # path: "{{ openhpc_state_save_location }}" - # owner: slurm - # group: slurm - # mode: 0755 - # state: directory - # when: inventory_hostname == openhpc_slurm_control_host - - # - name: Generate a Munge key on control host - # # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler - # command: "dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024" - # args: - # creates: "/etc/munge/munge.key" - # when: inventory_hostname == openhpc_slurm_control_host - - # - name: Retrieve Munge key from control host - # slurp: - # src: "/etc/munge/munge.key" - # register: openhpc_control_munge_key - # delegate_to: "{{ openhpc_slurm_control_host }}" - # when: openhpc_slurm_control_host in ansible_play_hosts - - name: Fix permissions on /etc to pass Munge startup checks # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root) # which fails munged startup checks @@ -320,83 +280,13 @@ state: directory mode: g-w - - name: Write Munge key + - name: Copy Munge key from NFS-mounted directory to /etc/munge copy: - content: "{{ openhpc_munge_key or (openhpc_control_munge_key.content | b64decode) }}" + src: "/mnt/openhpc_munge.key" dest: "/etc/munge/munge.key" owner: munge group: munge mode: 0400 - notify: - - Restart Munge service - - - name: Ensure JobComp logfile exists - file: - path: "{{ openhpc_slurm_job_comp_loc }}" - state: touch - owner: slurm - group: slurm - mode: 0644 - access_time: preserve - modification_time: preserve - when: openhpc_slurm_job_comp_type == 'jobcomp/filetxt' - - - name: Template slurmdbd.conf - template: - src: slurmdbd.conf.j2 - dest: /etc/slurm/slurmdbd.conf - mode: "0600" - owner: slurm - group: slurm - notify: Restart slurmdbd service - when: openhpc_enable.database | default(false) | bool - - - name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other - ansible.builtin.tempfile: - register: _slurm_conf_tmpfile - delegate_to: localhost - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - changed_when: false # so molecule doesn't fail - become: no - - - name: Template basic slurm.conf - template: - src: slurm.conf.j2 - dest: "{{ _slurm_conf_tmpfile.path }}" - lstrip_blocks: true - mode: 0644 - delegate_to: localhost - when: openhpc_enable.control | default(false) or not 
openhpc_slurm_configless - changed_when: false # so molecule doesn't fail - become: no - - - name: Customise slurm.conf - community.general.ini_file: - path: "{{ _slurm_conf_tmpfile.path }}" - option: "{{ item.key }}" - section: '' - value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}" - no_extra_spaces: true - create: no - mode: 0644 - loop: "{{ openhpc_config | dict2items }}" - delegate_to: localhost - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - changed_when: false # so molecule doesn't fail - become: no - - - name: Create slurm.conf - copy: - src: "{{ _slurm_conf_tmpfile.path }}" - dest: /etc/slurm/slurm.conf - owner: root - group: root - mode: 0644 - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - notify: - - Restart slurmctld service - register: ohpc_slurm_conf - # NB uses restart rather than reload as number of nodes might have changed - name: Create gres.conf template: @@ -405,82 +295,25 @@ mode: "0600" owner: slurm group: slurm - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - notify: - - Restart slurmctld service + when: openhpc_enable.control | default(false) register: ohpc_gres_conf - # NB uses restart rather than reload as this is needed in some cases - - - name: Template cgroup.conf - # appears to be required even with NO cgroup plugins: https://slurm.schedmd.com/cgroups.html#cgroup_design - template: - src: cgroup.conf.j2 - dest: /etc/slurm/cgroup.conf - mode: "0644" # perms/ownership based off src from ohpc package - owner: root - group: root - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - - - name: Remove local tempfile for slurm.conf templating - ansible.builtin.file: - path: "{{ _slurm_conf_tmpfile.path }}" - state: absent - when: _slurm_conf_tmpfile.path is defined - delegate_to: localhost - changed_when: false # so molecule doesn't fail - become: no - - - name: Notify handler for slurmd restart - debug: - msg: "notifying handlers" # meta: noop doesn't support 'when' - changed_when: true - when: - - openhpc_slurm_control_host in ansible_play_hosts - - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler - notify: - - Restart slurmd service - name: Set slurmctld location for configless operation lineinfile: path: /etc/sysconfig/slurmd - line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" + line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'" regexp: "^SLURMD_OPTIONS=" create: yes owner: root group: root mode: 0644 - when: - - openhpc_enable.batch | default(false) - - openhpc_slurm_configless - notify: - - Restart slurmd service - # Reloading is sufficent, but using a single handler means no bounce. Realistically this won't regularly change on a running slurmd so restarting is ok. - - # Munge state could be unchanged but the service is not running. - # Handle that here. 
+ - name: Configure Munge service service: name: munge enabled: "{{ openhpc_slurm_service_enabled | bool }}" state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - - name: Flush handler - meta: flush_handlers # as then subsequent "ensure" is a no-op if slurm services bounced - - - name: Ensure slurmdbd state - service: - name: slurmdbd - enabled: "{{ openhpc_slurm_service_enabled | bool }}" - state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - when: openhpc_enable.database | default(false) | bool - - # - name: Ensure slurmctld state - # service: - # name: slurmctld - # enabled: "{{ openhpc_slurm_service_enabled | bool }}" - # state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - # when: openhpc_enable.control | default(false) | bool - - name: Ensure slurmd state service: name: slurmd diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index d236c5e57..c5f884081 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -19,6 +19,7 @@ - ../../resolv_conf/templates/resolv.conf.j2 - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 + - ../../stackhpc.openhpc/templates/gres.conf.j2 - name: Ensure files directory exists file: @@ -86,6 +87,26 @@ group: root mode: 0644 +- name: Ensure /exports/cluster directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: 0644 + delegate_to: "{{ groups['control'] | first }}" + +- name: Write openhpc munge key + copy: + content: "{{ vault_openhpc_mungekey | b64decode }}" + dest: "/exports/cluster/openhpc_munge.key" + owner: munge + group: munge + mode: 0400 + become: true + delegate_to: "{{ groups['control'] | first }}" + + - name: Inject compute initialisation playbook copy: src: compute-init.yml From 4e4f20635e2435096bd8b810887f4d860a0e9a9d Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 12 Nov 2024 16:32:07 +0000 Subject: [PATCH 007/182] move manila share info mount to compute_init role --- ansible/roles/compute_init/files/compute-init.yml | 10 +++------- ansible/roles/compute_init/tasks/main.yml | 14 +++++++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index aaee0718b..5c4bd0005 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -3,19 +3,16 @@ - name: Compute node initialisation hosts: localhost become: yes + # VARS TO BE SUPPLIED VIA CLOUD INIT METADATA vars: control_node_ip: "172.16.1.228" nfs_export_hosts: "/exports/hosts" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - # block device (disk) on which to create the exported filesystem. - # if the disk is not defined, formatting and mounting will not be done. 
+ nfs_disk_location: - # Path to exported filesystem mountpoint on nfs servers nfs_export: "/exports/home" - # nfs client mount options nfs_client_mnt_options: - # Path to mountpoint on nfs clients nfs_client_mnt_point: "/home" nfs_client_mnt_state: mounted nfs_server: "{{ control_node_ip }}" @@ -48,7 +45,6 @@ basic_users_groups: [] - # Default to 10GB cvmfs_quota_limit_mb: 10000 cvmfs_config_default: CVMFS_CLIENT_PROFILE: single @@ -137,7 +133,7 @@ block: - name: Read manila share from nfs file slurp: - src: "/mnt/cluster/manila_share_info.yml" + src: "/mnt/manila_share_info.yml" register: manila_share_info_file - name: Parse and set fact for manila share info diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index c5f884081..bcc4db800 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -96,6 +96,19 @@ mode: 0644 delegate_to: "{{ groups['control'] | first }}" +- name: Ensure /exports/cluster exists on control node + ansible.builtin.file: + path: /exports/cluster + state: directory + mode: '0755' + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info | to_nice_yaml }}" + dest: "/exports/cluster/manila_share_info.yml" + delegate_to: "{{ groups['control'] | first }}" + - name: Write openhpc munge key copy: content: "{{ vault_openhpc_mungekey | b64decode }}" @@ -103,7 +116,6 @@ owner: munge group: munge mode: 0400 - become: true delegate_to: "{{ groups['control'] | first }}" From fda2d312fd5724e1db23578ba3b03a827b1151c9 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 13 Nov 2024 13:16:18 +0000 Subject: [PATCH 008/182] fix mounts --- ansible/filesystems.yml | 19 ++++++++++++++++ .../roles/compute_init/files/compute-init.yml | 15 +++++++++++++ ansible/roles/compute_init/tasks/main.yml | 22 ------------------- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/ansible/filesystems.yml b/ansible/filesystems.yml index e1a782bad..316ae23d8 100644 --- a/ansible/filesystems.yml +++ b/ansible/filesystems.yml @@ -24,3 +24,22 @@ tasks: - include_role: name: stackhpc.os-manila-mount + +- name: Manage /exports/cluster and Manila share info + hosts: control + become: true + tasks: + - block: + - name: Ensure /exports/cluster directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: 0755 + + - name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info | to_nice_yaml }}" + dest: "/exports/cluster/manila_share_info.yml" + when: os_manila_mount_share_info is defined \ No newline at end of file diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 5c4bd0005..0d163dbf3 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -7,6 +7,7 @@ vars: control_node_ip: "172.16.1.228" nfs_export_hosts: "/exports/hosts" + nfs_export_cluster: "/exports/cluster" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] @@ -127,6 +128,20 @@ src: "{{ nfs_server }}:{{ nfs_export }}" fstype: nfs state: "{{ nfs_client_mnt_state }}" + + - name: Ensure the mount directory exists + file: + path: /mnt/ + state: directory + mode: 0755 + + - name: Mount /mnt/ + mount: + path: /mnt/ + src: "{{ vars.control_node_ip }}:{{ nfs_export_cluster }}" + fstype: nfs + opts: rw,sync + state: mounted - name: Manila mount diff --git 
a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index bcc4db800..892b8b093 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -87,28 +87,6 @@ group: root mode: 0644 -- name: Ensure /exports/cluster directory exists - file: - path: /exports/cluster - state: directory - owner: root - group: root - mode: 0644 - delegate_to: "{{ groups['control'] | first }}" - -- name: Ensure /exports/cluster exists on control node - ansible.builtin.file: - path: /exports/cluster - state: directory - mode: '0755' - delegate_to: "{{ groups['control'] | first }}" - -- name: Copy manila share info to /exports/cluster - copy: - content: "{{ os_manila_mount_share_info | to_nice_yaml }}" - dest: "/exports/cluster/manila_share_info.yml" - delegate_to: "{{ groups['control'] | first }}" - - name: Write openhpc munge key copy: content: "{{ vault_openhpc_mungekey | b64decode }}" From 998ebf184445df2c9239d31ffa535702eda1d849 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 13 Nov 2024 16:46:46 +0000 Subject: [PATCH 009/182] address review comments --- ansible/filesystems.yml | 21 +----- .../roles/compute_init/files/compute-init.yml | 36 +++------ ansible/roles/compute_init/tasks/main.yml | 75 ++++++++++--------- ansible/roles/etc_hosts/tasks/main.yml | 23 ------ .../common/inventory/group_vars/all/nfs.yml | 11 +-- 5 files changed, 53 insertions(+), 113 deletions(-) diff --git a/ansible/filesystems.yml b/ansible/filesystems.yml index 316ae23d8..cf0db407f 100644 --- a/ansible/filesystems.yml +++ b/ansible/filesystems.yml @@ -23,23 +23,4 @@ tags: manila tasks: - include_role: - name: stackhpc.os-manila-mount - -- name: Manage /exports/cluster and Manila share info - hosts: control - become: true - tasks: - - block: - - name: Ensure /exports/cluster directory exists - file: - path: /exports/cluster - state: directory - owner: root - group: root - mode: 0755 - - - name: Copy manila share info to /exports/cluster - copy: - content: "{{ os_manila_mount_share_info | to_nice_yaml }}" - dest: "/exports/cluster/manila_share_info.yml" - when: os_manila_mount_share_info is defined \ No newline at end of file + name: stackhpc.os-manila-mount \ No newline at end of file diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 0d163dbf3..00043f0e8 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -6,8 +6,6 @@ # VARS TO BE SUPPLIED VIA CLOUD INIT METADATA vars: control_node_ip: "172.16.1.228" - nfs_export_hosts: "/exports/hosts" - nfs_export_cluster: "/exports/cluster" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] @@ -53,7 +51,7 @@ cvmfs_config_overrides: {} cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" - openhpc_conf_server: control_node_ip + openhpc_conf_server: "{{ control_node_ip }}" openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2 openhpc_slurm_service_enabled: true openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" @@ -90,25 +88,25 @@ when: _copy_nm_config.changed | default(false) - - name: Mount /etc/hosts on compute nodes + - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts block: - name: Ensure the mount directory exists file: - path: /mnt/hosts + path: /mnt/cluster state: directory mode: 0755 - - name: Mount /mnt/hosts + - name: Mount /mnt/cluster mount: - path: /mnt/hosts - src: "{{ vars.control_node_ip }}:{{ 
nfs_export_hosts }}" + path: /mnt/cluster + src: "{{ vars.control_node_ip }}:/exports/cluster" fstype: nfs opts: rw,sync state: mounted - - name: Copy /mnt/hosts/hosts contents to /etc/hosts + - name: Copy /mnt/cluster/hosts contents to /etc/hosts copy: - src: /mnt/hosts/hosts + src: /mnt/cluster/hosts dest: /etc/hosts owner: root group: root @@ -128,27 +126,13 @@ src: "{{ nfs_server }}:{{ nfs_export }}" fstype: nfs state: "{{ nfs_client_mnt_state }}" - - - name: Ensure the mount directory exists - file: - path: /mnt/ - state: directory - mode: 0755 - - - name: Mount /mnt/ - mount: - path: /mnt/ - src: "{{ vars.control_node_ip }}:{{ nfs_export_cluster }}" - fstype: nfs - opts: rw,sync - state: mounted - name: Manila mount block: - name: Read manila share from nfs file slurp: - src: "/mnt/manila_share_info.yml" + src: "/mnt/cluster/manila_share_info.yml" register: manila_share_info_file - name: Parse and set fact for manila share info @@ -293,7 +277,7 @@ - name: Copy Munge key from NFS-mounted directory to /etc/munge copy: - src: "/mnt/openhpc_munge.key" + src: "/mnt/cluster/openhpc_munge.key" dest: "/etc/munge/munge.key" owner: munge group: munge diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 892b8b093..40e30efae 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -1,12 +1,17 @@ --- -- name: Ensure templates directory exists +- name: Ensure directories exist file: - path: /etc/ansible-init/templates + path: "/etc/ansible-init/{{ item.directory }}" state: directory owner: root group: root - mode: 0644 + mode: 0755 + loop: + - { directory: "templates" } + - { directory: "files" } + - { directory: "library" } + - { directory: "filter_plugins" } - name: Inject templates copy: @@ -21,14 +26,6 @@ - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - ../../stackhpc.openhpc/templates/gres.conf.j2 -- name: Ensure files directory exists - file: - path: /etc/ansible-init/files - state: directory - owner: root - group: root - mode: 0644 - - name: Inject files copy: src: '{{ item }}' @@ -39,14 +36,6 @@ loop: - ../../resolv_conf/files/NetworkManager-dns-none.conf -- name: Ensure library directory exists - file: - path: /etc/ansible-init/library - state: directory - owner: root - group: root - mode: 0644 - - name: Inject files copy: src: '{{ item }}' @@ -59,14 +48,6 @@ - ../../stackhpc.os-manila-mount/library/os_manila_share.py - ../../stackhpc.openhpc/library/sacct_cluster.py -- name: Ensure filter_plugins directory exists - file: - path: /etc/ansible-init/filter_plugins - state: directory - owner: root - group: root - mode: 0644 - - name: Inject filter_plugins copy: src: '{{ item }}' @@ -87,15 +68,39 @@ group: root mode: 0644 -- name: Write openhpc munge key - copy: - content: "{{ vault_openhpc_mungekey | b64decode }}" - dest: "/exports/cluster/openhpc_munge.key" - owner: munge - group: munge - mode: 0400 - delegate_to: "{{ groups['control'] | first }}" +- name: Ensure nfs /exports/cluster configured + block: + - name: Ensure the /exports/hosts directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: 0755 + - name: Copy /etc/hosts to /exports/cluster + copy: + src: /etc/hosts + dest: /exports/cluster/hosts + owner: root + group: root + mode: 0644 + remote_src: true + + - name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info | to_nice_yaml }}" + dest: "/exports/cluster/manila_share_info.yml" + 
when: os_manila_mount_share_info is defined + + - name: Write openhpc munge key + copy: + content: "{{ vault_openhpc_mungekey | b64decode }}" + dest: "/exports/cluster/openhpc_munge.key" + owner: munge + group: munge + mode: 0400 + delegate_to: "{{ groups['control'] | first }}" - name: Inject compute initialisation playbook copy: diff --git a/ansible/roles/etc_hosts/tasks/main.yml b/ansible/roles/etc_hosts/tasks/main.yml index 1d04ebf7c..6fdabf57c 100644 --- a/ansible/roles/etc_hosts/tasks/main.yml +++ b/ansible/roles/etc_hosts/tasks/main.yml @@ -6,26 +6,3 @@ group: root mode: 0644 become: yes - -- name: Ensure /exports/hosts directory exists and copy /etc/hosts - block: - - name: Ensure the /exports/hosts directory exists - file: - path: /exports/hosts - state: directory - owner: root - group: root - mode: 0755 - become: yes - delegate_to: "{{ groups['control'] | first }}" - - - name: Copy /etc/hosts to NFS exported directory - copy: - src: /etc/hosts - dest: /exports/hosts/hosts - owner: root - group: root - mode: 0644 - remote_src: true - become: yes - delegate_to: "{{ groups['control'] | first }}" \ No newline at end of file diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 036850847..84371c99a 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -16,15 +16,8 @@ nfs_configurations: nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" - - comment: Export /etc/hosts copy from Slurm control node + - comment: Export /exports/cluster from Slurm control node nfs_enable: server: "{{ inventory_hostname in groups['control'] }}" clients: false - nfs_export: "/exports/hosts" # control node has to copy in /etc/hosts to here - - - comment: Export cluster info from control node - nfs_enable: - server: "{{ inventory_hostname in groups['control']}}" - clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" - nfs_server: "{{ nfs_server_default }}" - nfs_export: "/exports/cluster" \ No newline at end of file + nfs_export: "/exports/cluster" # control node has to copy in /etc/hosts to here From a32e3099020e48bc13162a6f20a23a7d86ae57f5 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 20 Nov 2024 10:48:27 +0000 Subject: [PATCH 010/182] remove gres.conf - no-op --- ansible/roles/compute_init/files/compute-init.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 00043f0e8..bd756c38a 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -52,7 +52,6 @@ cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" openhpc_conf_server: "{{ control_node_ip }}" - openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2 openhpc_slurm_service_enabled: true openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" openhpc_enable: @@ -283,16 +282,6 @@ group: munge mode: 0400 - - name: Create gres.conf - template: - src: "{{ openhpc_gres_template }}" - dest: /etc/slurm/gres.conf - mode: "0600" - owner: slurm - group: slurm - when: openhpc_enable.control | default(false) - register: ohpc_gres_conf - - name: Set slurmctld location for configless operation lineinfile: path: /etc/sysconfig/slurmd From a1f71b6244570f0f18705ec627bd2aea81ba6ab5 Mon Sep 17 00:00:00 
2001 From: bertiethorpe Date: Wed, 20 Nov 2024 12:51:52 +0000 Subject: [PATCH 011/182] remove or hardcode some vars, make resolv_conf block conditional --- .../roles/compute_init/files/compute-init.yml | 19 +++++-------------- ansible/roles/compute_init/tasks/main.yml | 1 - 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index bd756c38a..2a42d4b7c 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -9,7 +9,6 @@ resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - nfs_disk_location: nfs_export: "/exports/home" nfs_client_mnt_options: nfs_client_mnt_point: "/home" @@ -25,7 +24,6 @@ - noatime - _netdev # prevents mount blocking early boot before networking available - rw - os_manila_mount_share_info: [] # populated by lookup mode os_manila_mount_ceph_conf_path: /etc/ceph @@ -52,13 +50,6 @@ cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" openhpc_conf_server: "{{ control_node_ip }}" - openhpc_slurm_service_enabled: true - openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" - openhpc_enable: - control: false - batch: true - database: false - runtime: true tasks: - name: Configure resolve.conf @@ -85,6 +76,7 @@ name: NetworkManager state: reloaded when: _copy_nm_config.changed | default(false) + when: resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0 - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts @@ -295,12 +287,11 @@ - name: Configure Munge service service: name: munge - enabled: "{{ openhpc_slurm_service_enabled | bool }}" - state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + enabled: true + state: started - name: Ensure slurmd state service: name: slurmd - enabled: "{{ openhpc_slurm_service_enabled | bool }}" - state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - when: openhpc_enable.batch | default(false) | bool \ No newline at end of file + enabled: true + state: started \ No newline at end of file diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 40e30efae..ca0a006a8 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -24,7 +24,6 @@ - ../../resolv_conf/templates/resolv.conf.j2 - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - - ../../stackhpc.openhpc/templates/gres.conf.j2 - name: Inject files copy: From 61392edc02a4c896f334b7cee6c8a4f15e0d9185 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 20 Nov 2024 14:02:47 +0000 Subject: [PATCH 012/182] move EESSI CVMFS install and config to nfs export --- .../roles/compute_init/files/compute-init.yml | 42 +++---------------- ansible/roles/compute_init/tasks/main.yml | 19 ++++++++- 2 files changed, 24 insertions(+), 37 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 2a42d4b7c..65a05e1da 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -42,13 +42,6 @@ basic_users_groups: [] - cvmfs_quota_limit_mb: 10000 - cvmfs_config_default: - CVMFS_CLIENT_PROFILE: single - CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}" - cvmfs_config_overrides: {} - cvmfs_config: "{{ cvmfs_config_default | 
combine(cvmfs_config_overrides) }}" - openhpc_conf_server: "{{ control_node_ip }}" tasks: @@ -219,36 +212,13 @@ - name: Configure EESSI block: - - name: Download Cern GPG key - ansible.builtin.get_url: - url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM - dest: ./cvmfs-key.gpg - - - name: Import downloaded GPG key - command: rpm --import cvmfs-key.gpg - - - name: Add CVMFS repo - dnf: - name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm - - - name: Install CVMFS - dnf: - name: cvmfs - - - name: Install EESSI CVMFS config - dnf: - name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm - # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? - disable_gpg_check: true - - - name: Add base CVMFS config - community.general.ini_file: + - name: Copy /mnt/cluster/cvmfs/default.local contents to /etc/cvmfs/default.local + copy: + src: /mnt/cluster/cvmfs/default.local dest: /etc/cvmfs/default.local - section: null - option: "{{ item.key }}" - value: "{{ item.value }}" - no_extra_spaces: true - loop: "{{ cvmfs_config | dict2items }}" + owner: root + group: root + mode: 0644 # NOTE: Not clear how to make this idempotent - name: Ensure CVMFS config is setup diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index ca0a006a8..0e52d8892 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -69,7 +69,7 @@ - name: Ensure nfs /exports/cluster configured block: - - name: Ensure the /exports/hosts directory exists + - name: Ensure the /exports/cluster directory exists file: path: /exports/cluster state: directory @@ -92,6 +92,23 @@ dest: "/exports/cluster/manila_share_info.yml" when: os_manila_mount_share_info is defined + - name: Ensure /exports/cluster/cvmfs directory exists + file: + path: /exports/cluster/cvmfs + state: directory + owner: root + group: root + mode: 0755 + + - name: Copy EESSI CVMFS config to /exports/cluster + copy: + src: /etc/cvmfs/default.local + dest: /exports/cluster/cvmfs/default.local + owner: root + group: root + mode: 0644 + remote_src: true + - name: Write openhpc munge key copy: content: "{{ vault_openhpc_mungekey | b64decode }}" From 51b02d3f556f5c886a8ba1a3ae8f9de637ad14a6 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 20 Nov 2024 15:28:42 +0000 Subject: [PATCH 013/182] move manila mount share to nfs export --- .../roles/compute_init/files/compute-init.yml | 18 +++++++++++------- ansible/roles/compute_init/tasks/main.yml | 6 ++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 65a05e1da..f78bbe9b7 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -8,15 +8,12 @@ control_node_ip: "172.16.1.228" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - nfs_export: "/exports/home" nfs_client_mnt_options: nfs_client_mnt_point: "/home" nfs_client_mnt_state: mounted nfs_server: "{{ control_node_ip }}" - - os_manila_mount_shares: [] os_manila_mount_state: mounted os_manila_mount_opts: - x-systemd.device-timeout=30 @@ -26,7 +23,6 @@ - rw os_manila_mount_ceph_conf_path: /etc/ceph - basic_users_manage_homedir: false basic_users_userdefaults: state: present @@ -38,10 +34,8 @@ - name: testuser # can't use rocky as $HOME isn't shared! 
From 51b02d3f556f5c886a8ba1a3ae8f9de637ad14a6 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Wed, 20 Nov 2024 15:28:42 +0000
Subject: [PATCH 013/182] move manila mount share to nfs export

---
 .../roles/compute_init/files/compute-init.yml | 18 +++++++++++-------
 ansible/roles/compute_init/tasks/main.yml | 6 ++++++
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 65a05e1da..f78bbe9b7 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -8,15 +8,12 @@
     control_node_ip: "172.16.1.228"
     resolv_conf_nameservers: [1.1.1.1, 8.8.8.8]
 
-    nfs_export: "/exports/home"
     nfs_client_mnt_options:
     nfs_client_mnt_point: "/home"
     nfs_client_mnt_state: mounted
     nfs_server: "{{ control_node_ip }}"
-
-    os_manila_mount_shares: []
     os_manila_mount_state: mounted
     os_manila_mount_opts:
       - x-systemd.device-timeout=30
@@ -26,7 +23,6 @@
       - rw
     os_manila_mount_ceph_conf_path: /etc/ceph
 
-    basic_users_manage_homedir: false
     basic_users_userdefaults:
       state: present
@@ -38,10 +34,8 @@
       - name: testuser # can't use rocky as $HOME isn't shared!
         password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
         uid: 1005
-        state: present
     basic_users_groups: []
-
     openhpc_conf_server: "{{ control_node_ip }}"
 
   tasks:
@@ -114,15 +108,25 @@
     - name: Manila mount
       block:
-        - name: Read manila share from nfs file
+        - name: Read manila share info from nfs file
          slurp:
            src: "/mnt/cluster/manila_share_info.yml"
          register: manila_share_info_file
+          no_log: true
 
        - name: Parse and set fact for manila share info
          set_fact:
            os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}"
 
+        - name: Read manila shares from nfs file
+          slurp:
+            src: "/mnt/cluster/manila_shares.yml"
+          register: manila_shares_file
+
+        - name: Parse and set fact for manila shares
+          set_fact:
+            os_manila_mount_shares: "{{ manila_shares_file.content | b64decode | from_yaml }}"
+
        - name: Ensure Ceph configuration directory exists
          ansible.builtin.file:
            path: "{{ os_manila_mount_ceph_conf_path }}"
diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml
index 0e52d8892..f2bbfb72d 100644
--- a/ansible/roles/compute_init/tasks/main.yml
+++ b/ansible/roles/compute_init/tasks/main.yml
@@ -91,6 +91,12 @@
         content: "{{ os_manila_mount_share_info | to_nice_yaml }}"
         dest: "/exports/cluster/manila_share_info.yml"
       when: os_manila_mount_share_info is defined
+
+    - name: Copy manila mount shares to /exports/cluster
+      copy:
+        content: "{{ os_manila_mount_shares | to_nice_yaml }}"
+        dest: "/exports/cluster/manila_shares.yml"
+      when: os_manila_mount_shares is defined
 
     - name: Ensure /exports/cluster/cvmfs directory exists
       file:
         path: /exports/cluster/cvmfs
         state: directory
         owner: root
         group: root
         mode: 0755

From 134515d2348fe67aa02a162fa05c4a3111530092 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 20 Nov 2024 15:37:48 +0000
Subject: [PATCH 014/182] Pause CI testing for branch feat/compute-script

---
 .github/workflows/stackhpc.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index b08854adb..848517bb8 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -24,6 +24,8 @@ on:
       - '!.gitignore'
      - '!.github/workflows/'
      - '.github/workflows/stackhpc'
+    branches:
+      - '!feat/compute-script'
 
 jobs:
   openstack:
    name: openstack-ci
From 40d9e1fe108b146c9d7b680d5834c10c940b9191 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 22 Nov 2024 16:09:47 +0000
Subject: [PATCH 015/182] replaces system repos with ark repos during ci

---
 .github/workflows/fatimage.yml | 2 +
 .github/workflows/nightlybuild.yml | 2 +
 ansible/.gitignore | 2 +
 ansible/roles/release_train/defaults/main.yml | 8 ++
 .../release_train/tasks/revert_repos.yml | 19 ++++
 .../roles/release_train/tasks/set_repos.yml | 22 +++++
 .../templates/rocky-extras.repo.j2 | 65 +++++++++++++
 .../release_train/templates/rocky.repo.j2 | 93 +++++++++++++++++++
 environments/.stackhpc/hooks/post.yml | 10 +-
 environments/.stackhpc/hooks/pre.yml | 8 ++
 packer/openstack.pkr.hcl | 6 ++
 11 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 ansible/roles/release_train/defaults/main.yml
 create mode 100644 ansible/roles/release_train/tasks/revert_repos.yml
 create mode 100644 ansible/roles/release_train/tasks/set_repos.yml
 create mode 100644 ansible/roles/release_train/templates/rocky-extras.repo.j2
 create mode 100644 ansible/roles/release_train/templates/rocky.repo.j2

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index a8d3dbe29..cca652ef6 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -39,6 +39,7 @@ jobs:
             "openstack.openhpc": "rocky-latest-RL9"
           }
         }
+      ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
 
     steps:
       - uses: actions/checkout@v2
@@ -87,6 +88,7 @@ jobs:
             -only=${{ matrix.build }} \
             -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
             -var "source_image_name=${{ env.SOURCE_IMAGE }}" \
+            -var "ark_password=${ARK_PASSWORD}" \
             openstack.pkr.hcl
         env:
           PKR_VAR_os_version: ${{ matrix.os_version }}
diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml
index da3de4ea5..7fab8ebec 100644
--- a/.github/workflows/nightlybuild.yml
+++ b/.github/workflows/nightlybuild.yml
@@ -37,6 +37,7 @@ jobs:
         "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2",
         "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2"
       }
+      ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
 
     steps:
      - uses: actions/checkout@v2
@@ -85,6 +86,7 @@ jobs:
            -only=${{ matrix.build }} \
            -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
            -var "source_image_name=${{ env.SOURCE_IMAGE }}" \
+           -var "ark_password=${ARK_PASSWORD}" \
            openstack.pkr.hcl
        env:
diff --git a/ansible/.gitignore b/ansible/.gitignore
index 8edcc4360..ad841dc38 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -64,3 +64,5 @@ roles/*
 !roles/k9s/**
 !roles/lustre/
 !roles/lustre/**
+!roles/release_train/
+!roles/release_train/**
diff --git a/ansible/roles/release_train/defaults/main.yml b/ansible/roles/release_train/defaults/main.yml
new file mode 100644
index 000000000..dbae5e3b9
--- /dev/null
+++ b/ansible/roles/release_train/defaults/main.yml
@@ -0,0 +1,8 @@
+release_train_url_prefix: https://ark.stackhpc.com/pulp/content/rocky/9.4
+release_train_url_suffix: "x86_64/os/{{ release_train_timestamp }}/"
+# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
+# note that some timestamps can't be used because not all repos have snapshots for them
+release_train_timestamp: 20240816T002610
+release_train_auth: |
+  username = slurm-app-ci
+  password = {{ _github_secrets_ark_password }}
\ No newline at end of file
diff --git a/ansible/roles/release_train/tasks/revert_repos.yml b/ansible/roles/release_train/tasks/revert_repos.yml
new file mode 100644
index 000000000..8780ac13c
--- /dev/null
+++ b/ansible/roles/release_train/tasks/revert_repos.yml
@@ -0,0 +1,19 @@
+---
+
+- name: Check for backup folder exists
+  stat:
+    path: /etc/yum.repos.d.backup
+  register: _stat_yum_backup_file
+
+- name: Fail if backup folder doesn't exist
+  assert:
+    that: _stat_yum_backup_file.stat.exists
+
+- name: Remove ark repos
+  ansible.builtin.file:
+    state: absent
+    path: /etc/yum.repos.d
+
+- name: Restore backup repos
+  ansible.builtin.shell:
+    cmd: mv /etc/yum.repos.d.backup /etc/yum.repos.d
diff --git a/ansible/roles/release_train/tasks/set_repos.yml b/ansible/roles/release_train/tasks/set_repos.yml
new file mode 100644
index 000000000..f527a85d8
--- /dev/null
+++ b/ansible/roles/release_train/tasks/set_repos.yml
@@ -0,0 +1,22 @@
+---
+
+- name: Check for existing backup folder
+  stat:
+    path: /etc/yum.repos.d.backup
+  register: _stat_yum_backup_file
+
+- name: Backup existing package repos
+  ansible.builtin.copy:
+    remote_src: true
+    src: /etc/yum.repos.d/
+    dest: /etc/yum.repos.d.backup
+  when: not _stat_yum_backup_file.stat.exists
+
+- name: Replace package repos with release train repos
+  no_log: true
+  ansible.builtin.template:
+    src: "{{ item }}.j2"
+    dest: /etc/yum.repos.d/{{ item }}
+  loop:
+    - rocky-extras.repo
+    - rocky.repo
\ No newline at end of file
diff --git a/ansible/roles/release_train/templates/rocky-extras.repo.j2 b/ansible/roles/release_train/templates/rocky-extras.repo.j2
new file mode 100644
index 000000000..78bed03d5
--- /dev/null
+++ b/ansible/roles/release_train/templates/rocky-extras.repo.j2
@@ -0,0 +1,65 @@
+# rocky-extras.repo
+#
+# The mirrorlist system uses the connecting IP address of the client and the
+# update status of each mirror to pick current mirrors that are geographically
+# close to the client. You should use this for Rocky updates unless you are
+# manually picking other mirrors.
+#
+# If the mirrorlist does not work for you, you can try the commented out
+# baseurl line instead.
+
+[extras]
+name=Rocky Linux $releasever - Extras
+baseurl={{ release_train_url_prefix }}/extras/{{ release_train_url_suffix }}
+gpgcheck=1
+enabled=1
+countme=1
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+{{ release_train_auth }}
+
+[extras-debuginfo]
+name=Rocky Linux $releasever - Extras Debug
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=extras-$releasever-debug$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/extras/$basearch/debug/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[extras-source]
+name=Rocky Linux $releasever - Extras Source
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=extras-$releasever-source$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/extras/source/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[plus]
+name=Rocky Linux $releasever - Plus
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=plus-$releasever$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/$basearch/os/
+gpgcheck=1
+enabled=0
+countme=1
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[plus-debuginfo]
+name=Rocky Linux $releasever - Plus - Debug
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=plus-$releasever-debug$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/$basearch/debug/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[plus-source]
+name=Rocky Linux $releasever - Plus - Source
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=plus-$releasever-source$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/source/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
diff --git a/ansible/roles/release_train/templates/rocky.repo.j2 b/ansible/roles/release_train/templates/rocky.repo.j2
new file mode 100644
index 000000000..29d6aee42
--- /dev/null
+++ b/ansible/roles/release_train/templates/rocky.repo.j2
@@ -0,0 +1,93 @@
+# rocky.repo
+#
+# The mirrorlist system uses the connecting IP address of the client and the
+# update status of each mirror to pick current mirrors that are geographically
+# close to the client. You should use this for Rocky updates unless you are
+# manually picking other mirrors.
+#
+# If the mirrorlist does not work for you, you can try the commented out
+# baseurl line instead.
+
+[baseos]
+name=Rocky Linux $releasever - BaseOS
+baseurl={{ release_train_url_prefix }}/BaseOS/{{ release_train_url_suffix }}
+gpgcheck=1
+enabled=1
+countme=1
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+{{ release_train_auth }}
+
+[baseos-debuginfo]
+name=Rocky Linux $releasever - BaseOS - Debug
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=BaseOS-$releasever-debug$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/BaseOS/$basearch/debug/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[baseos-source]
+name=Rocky Linux $releasever - BaseOS - Source
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=BaseOS-$releasever-source$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/BaseOS/source/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[appstream]
+name=Rocky Linux $releasever - AppStream
+baseurl={{ release_train_url_prefix }}/AppStream/{{ release_train_url_suffix }}
+gpgcheck=1
+enabled=1
+countme=1
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+{{ release_train_auth }}
+
+[appstream-debuginfo]
+name=Rocky Linux $releasever - AppStream - Debug
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=AppStream-$releasever-debug$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/AppStream/$basearch/debug/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[appstream-source]
+name=Rocky Linux $releasever - AppStream - Source
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=AppStream-$releasever-source$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/AppStream/source/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[crb]
+name=Rocky Linux $releasever - CRB
+baseurl={{ release_train_url_prefix }}/CRB/{{ release_train_url_suffix }}
+gpgcheck=1
+enabled=0
+countme=1
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+{{ release_train_auth }}
+
+[crb-debuginfo]
+name=Rocky Linux $releasever - CRB - Debug
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=CRB-$releasever-debug$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/CRB/$basearch/debug/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
+
+[crb-source]
+name=Rocky Linux $releasever - CRB - Source
+mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=CRB-$releasever-source$rltype
+#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/CRB/source/tree/
+gpgcheck=1
+enabled=0
+metadata_expire=6h
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml
index bd60015d9..71ed02af8 100644
--- a/environments/.stackhpc/hooks/post.yml
+++ b/environments/.stackhpc/hooks/post.yml
@@ -11,4 +11,12 @@
     with_items:
       - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock"
       - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock"
-      - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock
\ No newline at end of file
+      - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock
+
+- hosts: builder
+  become: yes
+  tasks:
+    - name: Revert ark repos
+      ansible.builtin.include_role:
+        name: release_train
+        tasks_from: revert_repos.yml
diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml
index 0fdbf9f60..51e4bb5f0 100644
--- a/environments/.stackhpc/hooks/pre.yml
+++ b/environments/.stackhpc/hooks/pre.yml
@@ -17,3 +17,11 @@
     - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml"
     - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml"
     - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml"
+
+- hosts: builder
+  become: yes
+  tasks:
+    - name: Replace system repos with ark
+      ansible.builtin.include_role:
+        name: release_train
+        tasks_from: set_repos.yml
diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl
index 52202ead1..b5e6e4790 100644
--- a/packer/openstack.pkr.hcl
+++ b/packer/openstack.pkr.hcl
@@ -167,6 +167,11 @@ variable "extra_build_image_name" {
   default = "extra"
 }
 
+variable "ark_password" {
+  type = string
+  default = "none"
+}
+
 source "openstack" "openhpc" {
   # Build VM:
   flavor = var.flavor
@@ -228,6 +233,7 @@ build {
       "-i", "${var.repo_root}/packer/ansible-inventory.sh",
       "-vv",
       "-e", "@${var.repo_root}/packer/openhpc_extravars.yml", # not overridable by environments
+      "-e", "_github_secrets_ark_password=${var.ark_password}",
     ]
   }
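Since every repo URL in these templates is assembled from release_train_url_prefix, release_train_url_suffix and release_train_timestamp, retargeting a build at a different Ark snapshot is a single variable override rather than a template edit. A minimal sketch, assuming the chosen timestamp actually has snapshots for all of BaseOS, AppStream, CRB and extras (per the comment in defaults/main.yml, not every timestamp does):

    # hypothetical override, e.g. in builder group_vars
    release_train_timestamp: 20241001T010101  # check this snapshot exists on ark.stackhpc.com first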
From 9ef7d69563c39c86c0791a5b327ca1cb898fe10f Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Mon, 25 Nov 2024 08:45:01 +0000
Subject: [PATCH 016/182] now uses lookup instead of packer args

---
 .github/workflows/fatimage.yml | 1 -
 .github/workflows/nightlybuild.yml | 1 -
 ansible/roles/release_train/defaults/main.yml | 2 +-
 packer/openstack.pkr.hcl | 6 ------
 4 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index cca652ef6..217b09c22 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -88,7 +88,6 @@ jobs:
             -only=${{ matrix.build }} \
             -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
             -var "source_image_name=${{ env.SOURCE_IMAGE }}" \
-            -var "ark_password=${ARK_PASSWORD}" \
             openstack.pkr.hcl
         env:
           PKR_VAR_os_version: ${{ matrix.os_version }}
diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml
index 7fab8ebec..9f45b0890 100644
--- a/.github/workflows/nightlybuild.yml
+++ b/.github/workflows/nightlybuild.yml
@@ -86,7 +86,6 @@ jobs:
            -only=${{ matrix.build }} \
            -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
            -var "source_image_name=${{ env.SOURCE_IMAGE }}" \
-           -var "ark_password=${ARK_PASSWORD}" \
            openstack.pkr.hcl
        env:
diff --git a/ansible/roles/release_train/defaults/main.yml b/ansible/roles/release_train/defaults/main.yml
index dbae5e3b9..7c007c59a 100644
--- a/ansible/roles/release_train/defaults/main.yml
+++ b/ansible/roles/release_train/defaults/main.yml
@@ -5,4 +5,4 @@ release_train_url_suffix: "x86_64/os/{{ release_train_timestamp }}/"
 release_train_timestamp: 20240816T002610
 release_train_auth: |
   username = slurm-app-ci
-  password = {{ _github_secrets_ark_password }}
\ No newline at end of file
+  password = {{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}
diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl
index b5e6e4790..52202ead1 100644
--- a/packer/openstack.pkr.hcl
+++ b/packer/openstack.pkr.hcl
@@ -167,11 +167,6 @@ variable "extra_build_image_name" {
   default = "extra"
 }
 
-variable "ark_password" {
-  type = string
-  default = "none"
-}
-
 source "openstack" "openhpc" {
   # Build VM:
   flavor = var.flavor
@@ -233,7 +228,6 @@ build {
       "-i", "${var.repo_root}/packer/ansible-inventory.sh",
       "-vv",
       "-e", "@${var.repo_root}/packer/openhpc_extravars.yml", # not overridable by environments
-      "-e", "_github_secrets_ark_password=${var.ark_password}",
     ]
   }
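The switch above means the Ark credential no longer transits Packer's -var mechanism at all: the env lookup reads ARK_PASSWORD from the environment of the process running ansible-playbook, at the point the value is templated. A minimal sketch of the same pattern with a hypothetical MY_SECRET variable, defaulting to empty so an unset environment doesn't abort the play:

    - name: Render a credential read from the controller environment
      vars:
        my_secret: "{{ lookup('ansible.builtin.env', 'MY_SECRET', default='') }}"
      ansible.builtin.copy:
        content: "password = {{ my_secret }}"
        dest: /tmp/example-creds  # hypothetical destination
        mode: "0600"
      no_log: true  # keep the rendered secret out of task output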
From a6e12438d2920f3b2d928441f3e57f01dfb78ec7 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Mon, 25 Nov 2024 08:55:04 +0000
Subject: [PATCH 017/182] only applies to RL9 for now

---
 environments/.stackhpc/hooks/post.yml | 1 +
 environments/.stackhpc/hooks/pre.yml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml
index 71ed02af8..9622797ef 100644
--- a/environments/.stackhpc/hooks/post.yml
+++ b/environments/.stackhpc/hooks/post.yml
@@ -20,3 +20,4 @@
       ansible.builtin.include_role:
         name: release_train
         tasks_from: revert_repos.yml
+      when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml
index 51e4bb5f0..a15df2cd4 100644
--- a/environments/.stackhpc/hooks/pre.yml
+++ b/environments/.stackhpc/hooks/pre.yml
@@ -25,3 +25,4 @@
       ansible.builtin.include_role:
         name: release_train
         tasks_from: set_repos.yml
+      when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided

From 3e80268714e8a50c9d78a773e02f12daeac6240f Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Mon, 25 Nov 2024 12:17:16 +0000
Subject: [PATCH 018/182] set up rocky-latest-test builds and ci

---
 .github/workflows/fatimage.yml | 4 ++--
 .github/workflows/nightlybuild.yml | 4 ++--
 packer/openstack.pkr.hcl | 8 ++++++++
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index a8d3dbe29..d40504168 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -33,10 +33,10 @@ jobs:
       SOURCE_IMAGES_MAP: |
         {
           "RL8": {
-            "openstack.openhpc": "rocky-latest-RL8"
+            "openstack.openhpc": "rocky-latest-test-RL8"
           },
           "RL9": {
-            "openstack.openhpc": "rocky-latest-RL9"
+            "openstack.openhpc": "rocky-latest-test-RL9"
           }
         }
 
diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml
index da3de4ea5..66cbe8ba7 100644
--- a/.github/workflows/nightlybuild.yml
+++ b/.github/workflows/nightlybuild.yml
@@ -27,7 +27,7 @@ jobs:
           - RL8
           - RL9
         build:
-          - openstack.rocky-latest
+          - openstack.rocky-latest-test
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
@@ -138,7 +138,7 @@ jobs:
           - RL8
           - RL9
         image:
-          - rocky-latest
+          - rocky-latest-test
         exclude:
           - target_cloud: LEAFCLOUD
     env:
diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl
index 52202ead1..6fb1ff633 100644
--- a/packer/openstack.pkr.hcl
+++ b/packer/openstack.pkr.hcl
@@ -127,6 +127,7 @@ variable "volume_size" {
   default = {
     # fat image builds, GB:
     rocky-latest = 15
+    rocky-latest-test = 15
     openhpc = 15
   }
 }
@@ -152,6 +153,7 @@ variable "groups" {
   default = {
     # fat image builds:
     rocky-latest = ["update"]
+    rocky-latest-test = ["update"]
     openhpc = ["control", "compute", "login"]
   }
 }
@@ -206,6 +208,12 @@ build {
     image_name = "${source.name}-${var.os_version}"
   }
 
+  # latest nightly image test:
+  source "source.openstack.openhpc" {
+    name = "rocky-latest-test"
+    image_name = "${source.name}-${var.os_version}"
+  }
+
   # fat image:
   source "source.openstack.openhpc" {
     name = "openhpc"
"source.openstack.openhpc" { name = "openhpc" From 151746cad8d4d3abad63eab9df712392b2968e88 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 13:25:32 +0000 Subject: [PATCH 019/182] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index f9a2087c8..71a9162f8 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241118-0918-4538c6df", - "RL9": "openhpc-RL9-241118-0918-4538c6df" + "RL8": "openhpc-RL8-241125-1232-3e802687", + "RL9": "openhpc-RL9-241125-1232-3e802687" } } From 9c3301c28ebdccc8bb5574b50c12de7e75ef971a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 13:28:26 +0000 Subject: [PATCH 020/182] CI_CLOUD PR label override for trivy scan --- .github/workflows/trivyscan.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml index 4c090b85a..5b65baca1 100644 --- a/.github/workflows/trivyscan.yml +++ b/.github/workflows/trivyscan.yml @@ -25,6 +25,20 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Override CI_CLOUD if PR label is present + if: ${{ github.event_name == 'pull_request' }} + run: | + # Iterate over the labels + labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') + echo $labels + for label in $labels; do + if [[ $label == CI_CLOUD=* ]]; then + # Extract the value after 'CI_CLOUD=' + CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} + echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV + fi + done + - name: Record settings for CI cloud run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} From b2b21603b4246266a72f6d7b304a1ca086eaa762 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 14:19:33 +0000 Subject: [PATCH 021/182] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 71a9162f8..70422736e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241125-1232-3e802687", - "RL9": "openhpc-RL9-241125-1232-3e802687" + "RL8": "openhpc-RL8-241125-1349-9c3301c2", + "RL9": "openhpc-RL9-241125-1349-9c3301c2" } } From 0da074ba5b52fd0dfb63e824f88743f6e53bf562 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 15:31:55 +0000 Subject: [PATCH 022/182] bump containers.podman collection version --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 3d8c44011..142f377e5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -25,7 +25,7 @@ roles: collections: - name: containers.podman - version: 1.10.2 + version: 1.16.2 - name: community.grafana version: 1.5.4 - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools From 5ae1888fd7a70559c7b6575368adefaa25e6ddbc Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 25 Nov 2024 22:30:00 +0000 Subject: [PATCH 023/182] bump images --- 
.../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 70422736e..a4b65df6e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241125-1349-9c3301c2", - "RL9": "openhpc-RL9-241125-1349-9c3301c2" + "RL8": "openhpc-RL8-241125-1804-0da074ba", + "RL9": "openhpc-RL9-241125-1804-0da074ba" } } From b4d2d19d22fa6fb18c34bf4b1551ef4eae38569b Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 26 Nov 2024 09:57:57 +0000 Subject: [PATCH 024/182] debug site.yml --- .github/workflows/stackhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..17b142713 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -124,7 +124,7 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible all -m wait_for_connection - ansible-playbook -v ansible/site.yml + ansible-playbook -vvv ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml - name: Run MPI-based tests From 88e23de80019c30513b358b6c6d80bbd6c8b5acf Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 26 Nov 2024 10:21:37 +0000 Subject: [PATCH 025/182] mysql latest From 1eeef3790b91fa844290a968d438947d0f48e2ee Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:45:01 +0000 Subject: [PATCH 026/182] Bump openhpc role for slurm restart, templating and nodes in multiple groups --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 3d8c44011..579cdd5d5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.26.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/168 + version: v0.27.0 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 6671d69c44de8dc7d5cc1ed15ad4b136eed58215 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 26 Nov 2024 12:00:52 +0000 Subject: [PATCH 027/182] bump mysql From f66feb9ba5c1b15c5a8d3e6ca25e748549a755c5 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 27 Nov 2024 11:51:33 +0000 Subject: [PATCH 028/182] simplify slurm-init file injection loop --- ansible/roles/compute_init/tasks/main.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index f2bbfb72d..15ba586d1 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -2,16 +2,16 @@ - name: Ensure directories exist file: - path: "/etc/ansible-init/{{ item.directory }}" + path: "/etc/ansible-init/{{ item }}" state: directory owner: root group: root mode: 0755 loop: - - { directory: "templates" } - - { directory: "files" } - - { directory: "library" } - - { directory: "filter_plugins" } + - templates + - files + - library + - filter_plugins - name: Inject templates copy: @@ -35,7 +35,7 @@ loop: - ../../resolv_conf/files/NetworkManager-dns-none.conf -- name: 
From 6a8266c37c0aa5e6c321eabbcd539927d12e599f Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Wed, 27 Nov 2024 16:37:32 +0000
Subject: [PATCH 029/182] clear podman temp files on startup

---
 ansible/roles/podman/tasks/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ansible/roles/podman/tasks/config.yml b/ansible/roles/podman/tasks/config.yml
index 5fea3c2e0..74cf1d576 100644
--- a/ansible/roles/podman/tasks/config.yml
+++ b/ansible/roles/podman/tasks/config.yml
@@ -55,6 +55,7 @@
       # Type Path Mode User Group Age Argument
       R! /tmp/containers-user-*
       R! /tmp/podman-run-*
+      R! /tmp/storage-run-*
     dest: /etc/tmpfiles.d/podman-local.conf
     owner: root
     group: root
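The R! lines in that tmpfiles.d fragment tell systemd-tmpfiles to remove matching paths recursively, with the ! marking them as only safe to apply during boot - which is exactly when stale podman state under /tmp needs clearing. A minimal sketch, assuming you wanted to apply the file immediately instead of waiting for a reboot:

    - name: Apply tmpfiles.d removals now, including boot-only (!) lines
      command: systemd-tmpfiles --boot --remove /etc/tmpfiles.d/podman-local.conf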
From 33ffa655daafe55bf7d613664fdf8547716d94a2 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Wed, 27 Nov 2024 20:13:35 +0000
Subject: [PATCH 030/182] bump new images

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index a4b65df6e..830a96499 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241125-1804-0da074ba",
-        "RL9": "openhpc-RL9-241125-1804-0da074ba"
+        "RL8": "openhpc-RL8-241127-1704-6a8266c3",
+        "RL9": "openhpc-RL9-241127-1704-6a8266c3"
     }
 }

From f4c5cfe639b7b6145bea5f796cf2b9fdc7d96718 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Thu, 28 Nov 2024 10:52:22 +0000
Subject: [PATCH 031/182] stop using rocky-latest-test images in CI

---
 .github/workflows/fatimage.yml | 4 ++--
 .github/workflows/nightlybuild.yml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index d40504168..a8d3dbe29 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -33,10 +33,10 @@ jobs:
       SOURCE_IMAGES_MAP: |
         {
           "RL8": {
-            "openstack.openhpc": "rocky-latest-test-RL8"
+            "openstack.openhpc": "rocky-latest-RL8"
           },
           "RL9": {
-            "openstack.openhpc": "rocky-latest-test-RL9"
+            "openstack.openhpc": "rocky-latest-RL9"
           }
         }
 
diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml
index 66cbe8ba7..da3de4ea5 100644
--- a/.github/workflows/nightlybuild.yml
+++ b/.github/workflows/nightlybuild.yml
@@ -27,7 +27,7 @@ jobs:
           - RL8
           - RL9
         build:
-          - openstack.rocky-latest-test
+          - openstack.rocky-latest
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
@@ -138,7 +138,7 @@ jobs:
           - RL8
           - RL9
         image:
-          - rocky-latest-test
+          - rocky-latest
         exclude:
           - target_cloud: LEAFCLOUD
     env:

From d7a8dd20110abaf759a1bc0bea3d706282e2d242 Mon Sep 17 00:00:00 2001
From: bertiethorpe
Date: Thu, 28 Nov 2024 10:54:34 +0000
Subject: [PATCH 032/182] low verbosity CI site.yml

---
 .github/workflows/stackhpc.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index 17b142713..b08854adb 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -124,7 +124,7 @@ jobs:
           . venv/bin/activate
           . environments/.stackhpc/activate
           ansible all -m wait_for_connection
-          ansible-playbook -vvv ansible/site.yml
+          ansible-playbook -v ansible/site.yml
           ansible-playbook -v ansible/ci/check_slurm.yml
 
       - name: Run MPI-based tests
From 6faf91958fefdd424a0330956a70c0aca2d1a53c Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Fri, 29 Nov 2024 15:48:07 +0000
Subject: [PATCH 033/182] refactored ark role, disabled repos at end of build and modified site to work with disabled repos

---
 ansible/.gitignore | 4 +-
 ansible/fatimage.yml | 4 +
 ansible/roles/dnf_repos/defaults/main.yml | 24 +++++
 .../roles/dnf_repos/tasks/disable_repos.yml | 18 ++++
 ansible/roles/dnf_repos/tasks/set_repos.yml | 22 +++++
 ansible/roles/openondemand/tasks/main.yml | 1 +
 ansible/roles/openondemand/tasks/pam_auth.yml | 3 -
 ansible/roles/release_train/defaults/main.yml | 8 --
 .../release_train/tasks/revert_repos.yml | 19 ----
 .../roles/release_train/tasks/set_repos.yml | 22 -----
 .../templates/rocky-extras.repo.j2 | 65 -------------
 .../release_train/templates/rocky.repo.j2 | 93 -------------------
 ansible/slurm.yml | 1 +
 environments/.stackhpc/hooks/post.yml | 6 +-
 environments/.stackhpc/hooks/pre.yml | 2 +-
 .../inventory/group_vars/all/defaults.yml | 1 +
 .../inventory/group_vars/all/openhpc.yml | 10 ++
 packer/openhpc_extravars.yml | 1 +
 18 files changed, 91 insertions(+), 216 deletions(-)
 create mode 100644 ansible/roles/dnf_repos/defaults/main.yml
 create mode 100644 ansible/roles/dnf_repos/tasks/disable_repos.yml
 create mode 100644 ansible/roles/dnf_repos/tasks/set_repos.yml
 delete mode 100644 ansible/roles/release_train/defaults/main.yml
 delete mode 100644 ansible/roles/release_train/tasks/revert_repos.yml
 delete mode 100644 ansible/roles/release_train/tasks/set_repos.yml
 delete mode 100644 ansible/roles/release_train/templates/rocky-extras.repo.j2
 delete mode 100644 ansible/roles/release_train/templates/rocky.repo.j2

diff --git a/ansible/.gitignore b/ansible/.gitignore
index ad841dc38..48c917c4f 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -64,5 +64,5 @@ roles/*
 !roles/k9s/**
 !roles/lustre/
 !roles/lustre/**
-!roles/release_train/
-!roles/release_train/**
+!roles/dnf_repos/
+!roles/dnf_repos/**
diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index 7cad2dc59..ec0d4dd74 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -69,6 +69,10 @@
         tasks_from: install.yml
       when: "'openhpc' in group_names"
 
+    - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
+      yum:
+        name: mod_authnz_pam
+
     # - import_playbook: portal.yml
     - name: Open Ondemand server (packages)
       include_role:
diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
new file mode 100644
index 000000000..00778533c
--- /dev/null
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -0,0 +1,24 @@
+dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/9.4
+dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_ark_timestamp }}/"
+# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
+# note that some timestamps can't be used because not all repos have snapshots for them
+dnf_repos_ark_timestamp: 20240816T002610
+dnf_repos_username: slurm-app-ci
+dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
+
+# epel installed separately
+dnf_repos_repolist:
+- file: rocky
+  name: baseos
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}"
+- file: rocky
+  name: appstream
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
+- file: rocky
+  name: crb
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
+- file: rocky-extras
+  name: extras
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
+
+dnf_repos_epel_baseurl: https://ark.stackhpc.com/pulp/content/epel/9/Everything/x86_64/20240902T080424
diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml
new file mode 100644
index 000000000..f8997b741
--- /dev/null
+++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml
@@ -0,0 +1,18 @@
+---
+- name: Disable Pulp repos and remove creds
+  ansible.builtin.yum_repository:
+    file: "{{ item.file }}"
+    name: "{{ item.name }}"
+    baseurl: "{{ item.base_url }}"
+    description: "{{ item.name }}"
+    enabled: false
+  loop: "{{ dnf_repos_repolist }}"
+
+- name: Disable EPEL repo and remove creds
+  ansible.builtin.yum_repository:
+    name: epel
+    file: epel
+    description: epel
+    baseurl: "{{ dnf_repos_epel_baseurl }}"
+    gpgcheck: false
+    enabled: false
diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml
new file mode 100644
index 000000000..2c51b96ae
--- /dev/null
+++ b/ansible/roles/dnf_repos/tasks/set_repos.yml
@@ -0,0 +1,25 @@
+---
+
+- name: Replace system repos with Pulp repos
+  ansible.builtin.yum_repository:
+    file: "{{ item.file }}"
+    name: "{{ item.name }}"
+    baseurl: "{{ item.base_url }}"
+    description: "{{ item.name }}"
+    username: "{{ dnf_repos_username }}"
+    password: "{{ dnf_repos_password }}"
+  loop: "{{ dnf_repos_repolist }}"
+
+- name: Install epel-release
+  ansible.builtin.dnf:
+    name: epel-release
+
+- name: Use Pulp EPEL repo
+  ansible.builtin.yum_repository:
+    name: epel
+    file: epel
+    description: epel
+    gpgcheck: false
+    username: "{{ dnf_repos_username }}"
+    password: "{{ dnf_repos_password }}"
+    baseurl: "{{ dnf_repos_epel_baseurl }}"
diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml
index 86184f13c..a9b975c5b 100644
--- a/ansible/roles/openondemand/tasks/main.yml
+++ b/ansible/roles/openondemand/tasks/main.yml
@@ -12,6 +12,7 @@
     tasks_from: install-package.yml
     vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml"
     public: yes # Expose the vars from this role to the rest of the play
+  when: appliances_mode != 'configure'
 
 # can't set vars: from a dict hence the workaround above
 - include_tasks:
diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml b/ansible/roles/openondemand/tasks/pam_auth.yml
index 0edce622f..3ede2d3ce 100644
--- a/ansible/roles/openondemand/tasks/pam_auth.yml
+++ b/ansible/roles/openondemand/tasks/pam_auth.yml
@@ -1,8 +1,5 @@
 # https://osc.github.io/ood-documentation/latest/authentication/pam.html
 ---
-- name: Install Apache PAM module
-  yum:
-    name: mod_authnz_pam
 
 - name: Enable Apache PAM module
   lineinfile:
diff --git a/ansible/roles/release_train/defaults/main.yml b/ansible/roles/release_train/defaults/main.yml
deleted file mode 100644
index 7c007c59a..000000000
--- a/ansible/roles/release_train/defaults/main.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-release_train_url_prefix: https://ark.stackhpc.com/pulp/content/rocky/9.4
-release_train_url_suffix: "x86_64/os/{{ release_train_timestamp }}/"
-# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
-# note that some timestamps can't be used because not all repos have snapshots for them
-release_train_timestamp: 20240816T002610
-release_train_auth: |
-  username = slurm-app-ci
-  password = {{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}
diff --git a/ansible/roles/release_train/tasks/revert_repos.yml b/ansible/roles/release_train/tasks/revert_repos.yml
deleted file mode 100644
index 8780ac13c..000000000
--- a/ansible/roles/release_train/tasks/revert_repos.yml
+++ /dev/null
@@ -1,19 +0,0 @@
----
-
-- name: Check for backup folder exists
-  stat:
-    path: /etc/yum.repos.d.backup
-  register: _stat_yum_backup_file
-
-- name: Fail if backup folder doesn't exist
-  assert:
-    that: _stat_yum_backup_file.stat.exists
-
-- name: Remove ark repos
-  ansible.builtin.file:
-    state: absent
-    path: /etc/yum.repos.d
-
-- name: Restore backup repos
-  ansible.builtin.shell:
-    cmd: mv /etc/yum.repos.d.backup /etc/yum.repos.d
diff --git a/ansible/roles/release_train/tasks/set_repos.yml b/ansible/roles/release_train/tasks/set_repos.yml
deleted file mode 100644
index f527a85d8..000000000
--- a/ansible/roles/release_train/tasks/set_repos.yml
+++ /dev/null
@@ -1,22 +0,0 @@
----
-
-- name: Check for existing backup folder
-  stat:
-    path: /etc/yum.repos.d.backup
-  register: _stat_yum_backup_file
-
-- name: Backup existing package repos
-  ansible.builtin.copy:
-    remote_src: true
-    src: /etc/yum.repos.d/
-    dest: /etc/yum.repos.d.backup
-  when: not _stat_yum_backup_file.stat.exists
-
-- name: Replace package repos with release train repos
-  no_log: true
-  ansible.builtin.template:
-    src: "{{ item }}.j2"
-    dest: /etc/yum.repos.d/{{ item }}
-  loop:
-    - rocky-extras.repo
-    - rocky.repo
\ No newline at end of file
diff --git a/ansible/roles/release_train/templates/rocky-extras.repo.j2 b/ansible/roles/release_train/templates/rocky-extras.repo.j2
deleted file mode 100644
index 78bed03d5..000000000
--- a/ansible/roles/release_train/templates/rocky-extras.repo.j2
+++ /dev/null
@@ -1,65 +0,0 @@
-# rocky-extras.repo
-#
-# The mirrorlist system uses the connecting IP address of the client and the
-# update status of each mirror to pick current mirrors that are geographically
-# close to the client. You should use this for Rocky updates unless you are
-# manually picking other mirrors.
-#
-# If the mirrorlist does not work for you, you can try the commented out
-# baseurl line instead.
-
-[extras]
-name=Rocky Linux $releasever - Extras
-baseurl={{ release_train_url_prefix }}/extras/{{ release_train_url_suffix }}
-gpgcheck=1
-enabled=1
-countme=1
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-{{ release_train_auth }}
-
-[extras-debuginfo]
-name=Rocky Linux $releasever - Extras Debug
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=extras-$releasever-debug$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/extras/$basearch/debug/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[extras-source]
-name=Rocky Linux $releasever - Extras Source
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=extras-$releasever-source$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/extras/source/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[plus]
-name=Rocky Linux $releasever - Plus
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=plus-$releasever$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/$basearch/os/
-gpgcheck=1
-enabled=0
-countme=1
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[plus-debuginfo]
-name=Rocky Linux $releasever - Plus - Debug
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=plus-$releasever-debug$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/$basearch/debug/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[plus-source]
-name=Rocky Linux $releasever - Plus - Source
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=plus-$releasever-source$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/plus/source/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
diff --git a/ansible/roles/release_train/templates/rocky.repo.j2 b/ansible/roles/release_train/templates/rocky.repo.j2
deleted file mode 100644
index 29d6aee42..000000000
--- a/ansible/roles/release_train/templates/rocky.repo.j2
+++ /dev/null
@@ -1,93 +0,0 @@
-# rocky.repo
-#
-# The mirrorlist system uses the connecting IP address of the client and the
-# update status of each mirror to pick current mirrors that are geographically
-# close to the client. You should use this for Rocky updates unless you are
-# manually picking other mirrors.
-#
-# If the mirrorlist does not work for you, you can try the commented out
-# baseurl line instead.
-
-[baseos]
-name=Rocky Linux $releasever - BaseOS
-baseurl={{ release_train_url_prefix }}/BaseOS/{{ release_train_url_suffix }}
-gpgcheck=1
-enabled=1
-countme=1
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-{{ release_train_auth }}
-
-[baseos-debuginfo]
-name=Rocky Linux $releasever - BaseOS - Debug
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=BaseOS-$releasever-debug$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/BaseOS/$basearch/debug/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[baseos-source]
-name=Rocky Linux $releasever - BaseOS - Source
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=BaseOS-$releasever-source$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/BaseOS/source/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[appstream]
-name=Rocky Linux $releasever - AppStream
-baseurl={{ release_train_url_prefix }}/AppStream/{{ release_train_url_suffix }}
-gpgcheck=1
-enabled=1
-countme=1
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-{{ release_train_auth }}
-
-[appstream-debuginfo]
-name=Rocky Linux $releasever - AppStream - Debug
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=AppStream-$releasever-debug$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/AppStream/$basearch/debug/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[appstream-source]
-name=Rocky Linux $releasever - AppStream - Source
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=AppStream-$releasever-source$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/AppStream/source/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[crb]
-name=Rocky Linux $releasever - CRB
-baseurl={{ release_train_url_prefix }}/CRB/{{ release_train_url_suffix }}
-gpgcheck=1
-enabled=0
-countme=1
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-{{ release_train_auth }}
-
-[crb-debuginfo]
-name=Rocky Linux $releasever - CRB - Debug
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=$basearch&repo=CRB-$releasever-debug$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/CRB/$basearch/debug/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-
-[crb-source]
-name=Rocky Linux $releasever - CRB - Source
-mirrorlist=https://mirrors.rockylinux.org/mirrorlist?arch=source&repo=CRB-$releasever-source$rltype
-#baseurl=http://dl.rockylinux.org/$contentdir/$releasever/CRB/source/tree/
-gpgcheck=1
-enabled=0
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
diff --git a/ansible/slurm.yml b/ansible/slurm.yml
index 0b7397242..f2d37a60c 100644
--- a/ansible/slurm.yml
+++ b/ansible/slurm.yml
@@ -27,6 +27,7 @@
   tasks:
     - import_role:
         name: stackhpc.openhpc
+        tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"
 
 - name: Set locked memory limits on user-facing nodes
   hosts:
diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml
index 9622797ef..98e366304 100644
--- a/environments/.stackhpc/hooks/post.yml
+++ b/environments/.stackhpc/hooks/post.yml
@@ -16,8 +16,8 @@
 - hosts: builder
   become: yes
   tasks:
-    - name: Revert ark repos
+    - name: Disable ark repos
       ansible.builtin.include_role:
-        name: release_train
-        tasks_from: revert_repos.yml
+        name: dnf_repos
+        tasks_from: disable_repos.yml
       when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml
index a15df2cd4..9ea84740d 100644
--- a/environments/.stackhpc/hooks/pre.yml
+++ b/environments/.stackhpc/hooks/pre.yml
@@ -23,6 +23,6 @@
   tasks:
     - name: Replace system repos with ark
       ansible.builtin.include_role:
-        name: release_train
+        name: dnf_repos
         tasks_from: set_repos.yml
       when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 15340820f..2a88f035d 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -6,6 +6,7 @@
 appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only
 appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it
 #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform
+appliances_mode: configure
 
 # Address(ip/dns) for internal communication between services. This is
 # normally traffic you do no want to expose to users.
diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml
index c613fc697..a23bc77ba 100644
--- a/environments/common/inventory/group_vars/all/openhpc.yml
+++ b/environments/common/inventory/group_vars/all/openhpc.yml
@@ -38,3 +38,13 @@
 openhpc_config_extra: {}
 openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}"
 openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"
+
+ohpc_default_extra_repos:
+  "9": [] #overriding to ensure doesn't overwrite ark epel repo
+  "8":
+    - name: epel
+      file: epel
+      description: "Extra Packages for Enterprise Linux 8 - $basearch"
+      metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir"
+      gpgcheck: true
+      gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8"
diff --git a/packer/openhpc_extravars.yml b/packer/openhpc_extravars.yml
index 66f668649..e68741c01 100644
--- a/packer/openhpc_extravars.yml
+++ b/packer/openhpc_extravars.yml
@@ -1 +1,2 @@
 workaround_ansible_issue_61497: yes # extravars files can't be empty
+appliances_mode: "build"
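The dnf_repos role above is data-driven: each dnf_repos_repolist entry maps a yum repo (file plus section name) onto an Ark snapshot URL, and set_repos/disable_repos iterate the same list with and without credentials. Adding a repo is therefore a list override rather than a new task; a minimal sketch, assuming a hypothetical 'myrepo' snapshot actually exists under the same Ark prefix:

    dnf_repos_repolist:
    # ... the default baseos/appstream/crb/extras entries ...
    - file: myrepo            # hypothetical extra repo
      name: myrepo
      base_url: "{{ dnf_repos_rocky_ark_prefix }}/myrepo/{{ dnf_repos_rocky_ark_suffix }}"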
From 0bc473c27b5c29fa15d87da059ef88d438d58766 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Tue, 3 Dec 2024 14:12:42 +0000
Subject: [PATCH 034/182] fixed ood install with disabled repos + fixed ark CRB typo

---
 ansible/fatimage.yml | 8 ++++----
 ansible/roles/dnf_repos/defaults/main.yml | 2 +-
 ansible/roles/openondemand/tasks/main.yml | 8 +++++++-
 ansible/roles/openondemand/tasks/pam_auth.yml | 3 +++
 ansible/roles/openondemand/tasks/vnc_compute.yml | 1 +
 5 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index ec0d4dd74..b28e4f308 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -69,10 +69,6 @@
         tasks_from: install.yml
       when: "'openhpc' in group_names"
 
-    - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
-      yum:
-        name: mod_authnz_pam
-
     # - import_playbook: portal.yml
     - name: Open Ondemand server (packages)
       include_role:
@@ -102,6 +98,10 @@
         tasks_from: jupyter_compute.yml
       when: "'openondemand_jupyter' in group_names"
 
+    - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
+      yum:
+        name: mod_authnz_pam
+
     # - import_playbook: monitoring.yml:
     - import_role:
         name: opensearch
diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index 00778533c..000ae3524 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -16,7 +16,7 @@ dnf_repos_repolist:
   base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
 - file: rocky
   name: crb
-  base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}"
+  base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}"
 - file: rocky-extras
   name: extras
   base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml
index a9b975c5b..bd5706ecb 100644
--- a/ansible/roles/openondemand/tasks/main.yml
+++ b/ansible/roles/openondemand/tasks/main.yml
@@ -6,12 +6,18 @@
   loop: "{{ openondemand_osc_ood_defaults | dict2items }}"
   when: (item.key in hostvars[inventory_hostname]) or (item.value)
 
+# osc.ood variables are exposed to play here instead of setting 'public' in include role so that they will still be exposed during runtime
+- ansible.builtin.include_vars:
+    dir: "{{ playbook_dir }}/roles/osc.ood/defaults/main"
+
+- ansible.builtin.include_vars:
+    file: "{{ playbook_dir }}/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml"
+
 # if using PAM auth we need apache installed but NOT started so split the osc.ood role up:
 - include_role:
     name: osc.ood
     tasks_from: install-package.yml
     vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml"
-    public: yes # Expose the vars from this role to the rest of the play
   when: appliances_mode != 'configure'
 
 # can't set vars: from a dict hence the workaround above
diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml b/ansible/roles/openondemand/tasks/pam_auth.yml
index 3ede2d3ce..6bc4bda36 100644
--- a/ansible/roles/openondemand/tasks/pam_auth.yml
+++ b/ansible/roles/openondemand/tasks/pam_auth.yml
@@ -1,5 +1,8 @@
 # https://osc.github.io/ood-documentation/latest/authentication/pam.html
 ---
+- name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build
+  yum:
+    name: mod_authnz_pam
 
 - name: Enable Apache PAM module
   lineinfile:
diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml
index 388e3b3c5..6ec340249 100644
--- a/ansible/roles/openondemand/tasks/vnc_compute.yml
+++ b/ansible/roles/openondemand/tasks/vnc_compute.yml
@@ -48,6 +48,7 @@
   tags: install
   yum:
     name: '@Xfce'
+  when: appliances_mode != 'configure' # dnf group/module installs aren't idempotent so only run during build
 
 # - name: Ensure python3.9 installed
 #   dnf:
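appliances_mode is what lets one playbook tree serve both image build (set to 'build' for the Packer run) and post-deploy reconfiguration (the 'configure' default), with package installs skipped in the latter because the finished image has its Ark repos disabled. A minimal sketch of gating a task on it, with a hypothetical package name:

    - name: Install a package only while build repos are still enabled
      dnf:
        name: some-package  # hypothetical
      when: appliances_mode != 'configure'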
From 364ec79252f11d707b8705068676e23e876357aa Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Tue, 3 Dec 2024 16:12:18 +0000
Subject: [PATCH 035/182] fixed eessi install and slurm not loading appliances_mode

---
 ansible/roles/eessi/tasks/main.yaml | 1 +
 ansible/slurm.yml | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ansible/roles/eessi/tasks/main.yaml b/ansible/roles/eessi/tasks/main.yaml
index d121b6fdd..c61625b0e 100644
--- a/ansible/roles/eessi/tasks/main.yaml
+++ b/ansible/roles/eessi/tasks/main.yaml
@@ -10,6 +10,7 @@
 - name: Add CVMFS repo
   dnf:
     name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm
+    disable_gpg_check: true
 
 - name: Install CVMFS
   dnf:
diff --git a/ansible/slurm.yml b/ansible/slurm.yml
index f2d37a60c..cf282f786 100644
--- a/ansible/slurm.yml
+++ b/ansible/slurm.yml
@@ -25,7 +25,7 @@
   tags:
     - openhpc
   tasks:
-    - import_role:
+    - include_role:
         name: stackhpc.openhpc
         tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"
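The import_role to include_role change above matters because imports are resolved statically when the playbook is parsed, so templating tasks_from with a group variable like appliances_mode is unreliable; include_role defers evaluation to runtime. A minimal sketch of the distinction, with a hypothetical my_role:

    # static: resolved at parse time, before group_vars are reliably in scope
    - import_role:
        name: my_role

    # dynamic: tasks_from is templated at runtime
    - include_role:
        name: my_role
        tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"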
From b0558b95a162064d3a058d43fa012da2d3660a5a Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Tue, 3 Dec 2024 16:29:57 +0000
Subject: [PATCH 036/182] variables renames + more ansible facts in dnf_repos

---
 ansible/roles/dnf_repos/defaults/main.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index 000ae3524..a3e05d0e1 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -1,8 +1,8 @@
-dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/9.4
-dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_ark_timestamp }}/"
+dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}
+dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/"
 # most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml
 # note that some timestamps can't be used because not all repos have snapshots for them
-dnf_repos_ark_timestamp: 20240816T002610
+dnf_repos_rocky_ark_timestamp: 20240816T002610
 dnf_repos_username: slurm-app-ci
 dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
 
@@ -21,4 +21,5 @@ dnf_repos_repolist:
   name: extras
   base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}"
 
-dnf_repos_epel_baseurl: https://ark.stackhpc.com/pulp/content/epel/9/Everything/x86_64/20240902T080424
+dnf_repos_epel_timestamp: 20240902T080424
+dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}"
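Deriving the URLs from ansible facts rather than hard-coded '9.4'/'x86_64' means the same defaults resolve correctly per host OS version and architecture. A quick way to check what a given host would use - a minimal sketch, assuming the dnf_repos defaults are in scope:

    - name: Show the Ark BaseOS snapshot URL this host resolves to
      ansible.builtin.debug:
        msg: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}"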
"openhpc-RL8-241118-0918-4538c6df", - "RL9": "openhpc-RL9-241118-0918-4538c6df" + "RL8": "openhpc-RL8-241203-1659-b0558b95", + "RL9": "openhpc-RL9-241203-1659-b0558b95" } } From 1be9c6b7697e1d0a292f27ca4ee5f2702c8612fd Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 4 Dec 2024 10:00:44 +0000 Subject: [PATCH 038/182] added review comment Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/dnf_repos/tasks/set_repos.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index 2c51b96ae..f8cca5600 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -11,6 +11,7 @@ loop: "{{ dnf_repos_repolist }}" - name: Install epel-release + # done so that roles installing epel via epel-release don't over-write our changes to the epel repo ansible.builtin.dnf: name: epel-release From b7670e94d371118f0eb1c5084d1a7a4044f6665a Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 4 Dec 2024 10:10:00 +0000 Subject: [PATCH 039/182] moved config into builder and .stackhpc --- .../inventory/group_vars/openhpc/overrides.yml | 10 ++++++++++ .../common/inventory/group_vars/all/openhpc.yml | 10 ---------- .../common/inventory/group_vars/builder/defaults.yml | 1 + packer/openhpc_extravars.yml | 1 - 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml index 5aac5f8ad..858dfd9d3 100644 --- a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml +++ b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml @@ -1,3 +1,13 @@ openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug + +ohpc_default_extra_repos: + "9": [] #overriding to ensure doesn't overwrite ark epel repo + "8": + - name: epel + file: epel + description: "Extra Packages for Enterprise Linux 8 - $basearch" + metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" + gpgcheck: true + gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index a23bc77ba..c613fc697 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -38,13 +38,3 @@ openhpc_config_default: openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}" - -ohpc_default_extra_repos: - "9": [] #overriding to ensure doesn't overwrite ark epel repo - "8": - - name: epel - file: epel - description: "Extra Packages for Enterprise Linux 8 - $basearch" - metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" - gpgcheck: true - gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml index 22042c1bf..b43d9f03c 100644 --- a/environments/common/inventory/group_vars/builder/defaults.yml +++ 
b/environments/common/inventory/group_vars/builder/defaults.yml @@ -22,3 +22,4 @@ squid_cache_disk: 0 # just needs to be defined squid_cache_mem: 0 tuned_started: false tuned_enabled: false +appliances_mode: build diff --git a/packer/openhpc_extravars.yml b/packer/openhpc_extravars.yml index e68741c01..66f668649 100644 --- a/packer/openhpc_extravars.yml +++ b/packer/openhpc_extravars.yml @@ -1,2 +1 @@ workaround_ansible_issue_61497: yes # extravars files can't be empty -appliances_mode: "build" From 2230bb8af6b8bede2ea1f712913d76e59a70f79b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 4 Dec 2024 10:17:26 +0000 Subject: [PATCH 040/182] overriding openhpc extra repos in common --- .../inventory/group_vars/openhpc/overrides.yml | 10 ---------- .../common/inventory/group_vars/all/openhpc.yml | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml index 858dfd9d3..5aac5f8ad 100644 --- a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml +++ b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml @@ -1,13 +1,3 @@ openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug - -ohpc_default_extra_repos: - "9": [] #overriding to ensure doesn't overwrite ark epel repo - "8": - - name: epel - file: epel - description: "Extra Packages for Enterprise Linux 8 - $basearch" - metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" - gpgcheck: true - gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index c613fc697..a23bc77ba 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -38,3 +38,13 @@ openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}" + +ohpc_default_extra_repos: + "9": [] #overriding to ensure doesn't overwrite ark epel repo + "8": + - name: epel + file: epel + description: "Extra Packages for Enterprise Linux 8 - $basearch" + metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" + gpgcheck: true + gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" From 4de581c71f2ef9976aa044286b4ed12c29b729cd Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 4 Dec 2024 16:50:21 +0000 Subject: [PATCH 041/182] Use rocky 9.4 release train snapshots for builds (#486) * replaces system repos with ark repos during ci * now uses lookup instead of packer args * only applies to RL9 for now * refactored ark role, disabled repos at end of build and modified site to work with disabled repos * fixed ood install with disabled repos + fixed ark CRB typo * fixed eessi install and slurm not loading appliances_mode * variables renames + more ansible facts in dnf_repos * bump images * added review comment Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * moved config into builder and .stackhpc --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- 
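Notes: the dnf_repos role added in this patch assembles each repo URL from the prefix/suffix/timestamp variables in its defaults. As a rough sketch (illustrative only, not part of the patch), the baseos entry in dnf_repos_repolist resolves as follows on a Rocky 9.4 x86_64 build, with the snapshot timestamp taken from dnf_repos_rocky_ark_timestamp:

- file: rocky
  name: baseos
  # resolved from dnf_repos_rocky_ark_prefix + "/BaseOS/" + dnf_repos_rocky_ark_suffix:
  base_url: "https://ark.stackhpc.com/pulp/content/rocky/9.4/BaseOS/x86_64/os/20240816T002610/"
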
.github/workflows/fatimage.yml | 1 + .github/workflows/nightlybuild.yml | 1 + ansible/.gitignore | 2 ++ ansible/fatimage.yml | 4 +++ ansible/roles/dnf_repos/defaults/main.yml | 25 ++++++++++++++++++ .../roles/dnf_repos/tasks/disable_repos.yml | 18 +++++++++++++ ansible/roles/dnf_repos/tasks/set_repos.yml | 26 +++++++++++++++++++ ansible/roles/eessi/tasks/main.yaml | 1 + ansible/roles/openondemand/tasks/main.yml | 9 ++++++- ansible/roles/openondemand/tasks/pam_auth.yml | 2 +- .../roles/openondemand/tasks/vnc_compute.yml | 1 + ansible/slurm.yml | 3 ++- environments/.stackhpc/hooks/post.yml | 11 +++++++- environments/.stackhpc/hooks/pre.yml | 9 +++++++ .../group_vars/openhpc/overrides.yml | 10 +++++++ .../terraform/cluster_image.auto.tfvars.json | 4 +-- .../inventory/group_vars/all/defaults.yml | 1 + .../inventory/group_vars/builder/defaults.yml | 1 + 18 files changed, 123 insertions(+), 6 deletions(-) create mode 100644 ansible/roles/dnf_repos/defaults/main.yml create mode 100644 ansible/roles/dnf_repos/tasks/disable_repos.yml create mode 100644 ansible/roles/dnf_repos/tasks/set_repos.yml diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index a8d3dbe29..217b09c22 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -39,6 +39,7 @@ jobs: "openstack.openhpc": "rocky-latest-RL9" } } + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index da3de4ea5..9f45b0890 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -37,6 +37,7 @@ jobs: "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" } + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: - uses: actions/checkout@v2 diff --git a/ansible/.gitignore b/ansible/.gitignore index 8edcc4360..48c917c4f 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -64,3 +64,5 @@ roles/* !roles/k9s/** !roles/lustre/ !roles/lustre/** +!roles/dnf_repos/ +!roles/dnf_repos/** diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 7cad2dc59..b28e4f308 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -98,6 +98,10 @@ tasks_from: jupyter_compute.yml when: "'openondemand_jupyter' in group_names" + - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build + yum: + name: mod_authnz_pam + # - import_playbook: monitoring.yml: - import_role: name: opensearch diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml new file mode 100644 index 000000000..a3e05d0e1 --- /dev/null +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -0,0 +1,25 @@ +dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }} +dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/" +# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml +# note that some timestamps can't be used because not all repos have snapshots for them +dnf_repos_rocky_ark_timestamp: 20240816T002610 +dnf_repos_username: slurm-app-ci +dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" + +# epel installed separately +dnf_repos_repolist: +- file: rocky + name: baseos + base_url: "{{ 
dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}" +- file: rocky + name: appstream + base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}" +- file: rocky + name: crb + base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}" +- file: rocky-extras + name: extras + base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}" + +dnf_repos_epel_timestamp: 20240902T080424 +dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}" diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml new file mode 100644 index 000000000..f8997b741 --- /dev/null +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -0,0 +1,18 @@ +--- +- name: Disable Pulp repos and remove creds + ansible.builtin.yum_repository: + file: "{{ item.file }}" + name: "{{ item.name }}" + baseurl: "{{ item.base_url }}" + description: "{{ item.name }}" + enabled: false + loop: "{{ dnf_repos_repolist }}" + +- name: Disable EPEL repo and remove creds + ansible.builtin.yum_repository: + name: epel + file: epel + description: epel + baseurl: "{{ dnf_repos_epel_baseurl }}" + gpgcheck: false + enabled: false diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml new file mode 100644 index 000000000..f8cca5600 --- /dev/null +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -0,0 +1,26 @@ +--- + +- name: Replace system repos with Pulp repos + ansible.builtin.yum_repository: + file: "{{ item.file }}" + name: "{{ item.name }}" + baseurl: "{{ item.base_url }}" + description: "{{ item.name }}" + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" + loop: "{{ dnf_repos_repolist }}" + +- name: Install epel-release + # done so that roles installing epel via epel-release don't over-write our changes to the epel repo + ansible.builtin.dnf: + name: epel-release + +- name: Use Pulp EPEL repo + ansible.builtin.yum_repository: + name: epel + file: epel + description: epel + gpgcheck: false + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" + baseurl: "{{ dnf_repos_epel_baseurl }}" diff --git a/ansible/roles/eessi/tasks/main.yaml b/ansible/roles/eessi/tasks/main.yaml index d121b6fdd..c61625b0e 100644 --- a/ansible/roles/eessi/tasks/main.yaml +++ b/ansible/roles/eessi/tasks/main.yaml @@ -10,6 +10,7 @@ - name: Add CVMFS repo dnf: name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm + disable_gpg_check: true - name: Install CVMFS dnf: diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index 86184f13c..bd5706ecb 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -6,12 +6,19 @@ loop: "{{ openondemand_osc_ood_defaults | dict2items }}" when: (item.key in hostvars[inventory_hostname]) or (item.value) +# osc.ood variables are exposed to play here instead of setting 'public' in include role so that they will still be exposed during runtime +- ansible.builtin.include_vars: + dir: "{{ playbook_dir }}/roles/osc.ood/defaults/main" + +- ansible.builtin.include_vars: + file: "{{ playbook_dir }}/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml" + # if using PAM auth we need apache installed but NOT started so split the osc.ood role up: - 
include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" - public: yes # Expose the vars from this role to the rest of the play + when: appliances_mode != 'configure' # can't set vars: from a dict hence the workaround above - include_tasks: diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml b/ansible/roles/openondemand/tasks/pam_auth.yml index 0edce622f..6bc4bda36 100644 --- a/ansible/roles/openondemand/tasks/pam_auth.yml +++ b/ansible/roles/openondemand/tasks/pam_auth.yml @@ -1,6 +1,6 @@ # https://osc.github.io/ood-documentation/latest/authentication/pam.html --- -- name: Install Apache PAM module +- name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build yum: name: mod_authnz_pam diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index 388e3b3c5..6ec340249 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -48,6 +48,7 @@ tags: install yum: name: '@Xfce' + when: appliances_mode != 'configure' # dnf group/module installs aren't idempotent so only run during build # - name: Ensure python3.9 installed # dnf: diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 0b7397242..cf282f786 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -25,8 +25,9 @@ tags: - openhpc tasks: - - import_role: + - include_role: name: stackhpc.openhpc + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Set locked memory limits on user-facing nodes hosts: diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml index bd60015d9..98e366304 100644 --- a/environments/.stackhpc/hooks/post.yml +++ b/environments/.stackhpc/hooks/post.yml @@ -11,4 +11,13 @@ with_items: - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" - - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock \ No newline at end of file + - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock + +- hosts: builder + become: yes + tasks: + - name: Disable ark repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: disable_repos.yml + when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 0fdbf9f60..9ea84740d 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -17,3 +17,12 @@ - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml" + +- hosts: builder + become: yes + tasks: + - name: Replace system repos with ark + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml + when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml index 5aac5f8ad..858dfd9d3 100644 
--- a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml +++ b/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml @@ -1,3 +1,13 @@ openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug + +ohpc_default_extra_repos: + "9": [] #overriding to ensure doesn't overwrite ark epel repo + "8": + - name: epel + file: epel + description: "Extra Packages for Enterprise Linux 8 - $basearch" + metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" + gpgcheck: true + gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index f9a2087c8..14c997596 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241118-0918-4538c6df", - "RL9": "openhpc-RL9-241118-0918-4538c6df" + "RL8": "openhpc-RL8-241203-1659-b0558b95", + "RL9": "openhpc-RL9-241203-1659-b0558b95" } } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 15340820f..2a88f035d 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -6,6 +6,7 @@ appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }} appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform +appliances_mode: configure # Address(ip/dns) for internal communication between services. This is # normally traffic you do no want to expose to users. 
diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml index 22042c1bf..b43d9f03c 100644 --- a/environments/common/inventory/group_vars/builder/defaults.yml +++ b/environments/common/inventory/group_vars/builder/defaults.yml @@ -22,3 +22,4 @@ squid_cache_disk: 0 # just needs to be defined squid_cache_mem: 0 tuned_started: false tuned_enabled: false +appliances_mode: build From 9723782e7fcb284945e67a17aec1a756f708f89b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 6 Dec 2024 13:04:29 +0000 Subject: [PATCH 042/182] testing builds with leafcloud pulp --- ansible/roles/dnf_repos/defaults/main.yml | 48 ++++++++++++++----- ansible/roles/dnf_repos/tasks/set_repos.yml | 4 -- .../inventory/group_vars/builder.yml | 1 + 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index a3e05d0e1..b997605ea 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,25 +1,47 @@ -dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }} -dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/" -# most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml -# note that some timestamps can't be used because not all repos have snapshots for them -dnf_repos_rocky_ark_timestamp: 20240816T002610 -dnf_repos_username: slurm-app-ci -dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" +# dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }} +# dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/" +# # most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml +# # note that some timestamps can't be used because not all repos have snapshots for them +# dnf_repos_rocky_ark_timestamp: 20240816T002610 +# dnf_repos_username: slurm-app-ci +# dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" + +# # epel installed separately +# dnf_repos_repolist: +# - file: rocky +# name: baseos +# base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}" +# - file: rocky +# name: appstream +# base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}" +# - file: rocky +# name: crb +# base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}" +# - file: rocky-extras +# name: extras +# base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}" + +# dnf_repos_epel_timestamp: 20240902T080424 +# dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}" + +dnf_repos_pulp_url: # required +dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" +dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" +dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" # epel installed separately dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}" + 
base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/baseos" - file: rocky name: appstream - base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/appstream" - file: rocky name: crb - base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/crb" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras" -dnf_repos_epel_timestamp: 20240902T080424 -dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index f8cca5600..8a8364097 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -6,8 +6,6 @@ name: "{{ item.name }}" baseurl: "{{ item.base_url }}" description: "{{ item.name }}" - username: "{{ dnf_repos_username }}" - password: "{{ dnf_repos_password }}" loop: "{{ dnf_repos_repolist }}" - name: Install epel-release @@ -21,6 +19,4 @@ file: epel description: epel gpgcheck: false - username: "{{ dnf_repos_username }}" - password: "{{ dnf_repos_password }}" baseurl: "{{ dnf_repos_epel_baseurl }}" diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 8d7ee98d2..1a65daa48 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1 +1,2 @@ #update_enable: false # Can uncomment for speed debugging non-update related build issues +dnf_repos_pulp_url: http://192.168.10.157:8080 From 127b79210af6d806c82674d4a0cbe64eb07e3fff Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 6 Dec 2024 15:12:50 +0000 Subject: [PATCH 043/182] pulp integration --- ansible/.gitignore | 2 + ansible/adhoc/deploy-pulp.yml | 25 +++++++ ansible/bootstrap.yml | 17 +++++ ansible/roles/dnf_repos/defaults/main.yml | 28 +------ ansible/roles/passwords/defaults/main.yml | 1 + ansible/roles/pulp_site/defaults/main.yml | 75 +++++++++++++++++++ ansible/roles/pulp_site/tasks/install.yml | 43 +++++++++++ ansible/roles/pulp_site/tasks/sync.yml | 73 ++++++++++++++++++ ansible/roles/pulp_site/templates/cli.toml.j2 | 14 ++++ .../roles/pulp_site/templates/settings.py.j2 | 2 + ansible/site.yml | 9 +++ environments/.stackhpc/hooks/post.yml | 9 --- environments/.stackhpc/hooks/pre.yml | 9 --- .../inventory/group_vars/builder.yml | 2 +- .../inventory/group_vars/all/defaults.yml | 1 + environments/common/inventory/groups | 4 + requirements.txt | 3 +- requirements.yml | 2 + 18 files changed, 272 insertions(+), 47 deletions(-) create mode 100644 ansible/adhoc/deploy-pulp.yml create mode 100644 ansible/roles/pulp_site/defaults/main.yml create mode 100644 ansible/roles/pulp_site/tasks/install.yml create mode 100644 ansible/roles/pulp_site/tasks/sync.yml create mode 100644 ansible/roles/pulp_site/templates/cli.toml.j2 create mode 100644 ansible/roles/pulp_site/templates/settings.py.j2 diff --git a/ansible/.gitignore b/ansible/.gitignore index 
48c917c4f..4eba25fa9 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -66,3 +66,5 @@ roles/* !roles/lustre/** !roles/dnf_repos/ !roles/dnf_repos/** +!roles/pulp_site/ +!roles/pulp_site/** diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml new file mode 100644 index 000000000..291da7f59 --- /dev/null +++ b/ansible/adhoc/deploy-pulp.yml @@ -0,0 +1,25 @@ +# Usage: ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server=" + +- name: Add temporary pulp server host + hosts: localhost + tasks: + - ansible.builtin.add_host: + name: "{{ pulp_server }}" + group: "_pulp_host" + +- name: Install pulp on server + become: yes + hosts: _pulp_host + tasks: + - ansible.builtin.import_role: + name: pulp_site + tasks_from: install.yml + +- name: Add pulp host to environment + hosts: localhost + tasks: + - ansible.builtin.copy: + dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml" + content: | + # ansible managed + appliances_pulp_server: "http://{{ pulp_server }}" diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 733d4b3f8..cc3cf7a12 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -110,6 +110,23 @@ policy: "{{ selinux_policy }}" register: sestatus +- name: Sync pulp repos with upstream + hosts: localhost + tasks: + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + when: appliances_mode != 'configure' + +- hosts: dnf_repos + become: yes + tasks: + - name: Replace system repos with pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml + when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided + # --- tasks after here require access to package repos --- - hosts: squid tags: squid diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index b997605ea..24bb4852b 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,30 +1,4 @@ -# dnf_repos_rocky_ark_prefix: https://ark.stackhpc.com/pulp/content/{{ ansible_distribution | lower }}/{{ ansible_distribution_version }} -# dnf_repos_rocky_ark_suffix: "{{ ansible_architecture }}/os/{{ dnf_repos_rocky_ark_timestamp }}/" -# # most stable from https://github.com/stackhpc/stackhpc-kayobe-config/blob/stackhpc/2024.1/etc/kayobe/pulp-repo-versions.yml -# # note that some timestamps can't be used because not all repos have snapshots for them -# dnf_repos_rocky_ark_timestamp: 20240816T002610 -# dnf_repos_username: slurm-app-ci -# dnf_repos_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" - -# # epel installed separately -# dnf_repos_repolist: -# - file: rocky -# name: baseos -# base_url: "{{ dnf_repos_rocky_ark_prefix }}/BaseOS/{{ dnf_repos_rocky_ark_suffix }}" -# - file: rocky -# name: appstream -# base_url: "{{ dnf_repos_rocky_ark_prefix }}/AppStream/{{ dnf_repos_rocky_ark_suffix }}" -# - file: rocky -# name: crb -# base_url: "{{ dnf_repos_rocky_ark_prefix }}/CRB/{{ dnf_repos_rocky_ark_suffix }}" -# - file: rocky-extras -# name: extras -# base_url: "{{ dnf_repos_rocky_ark_prefix }}/extras/{{ dnf_repos_rocky_ark_suffix }}" - -# dnf_repos_epel_timestamp: 20240902T080424 -# dnf_repos_epel_baseurl: "https://ark.stackhpc.com/pulp/content/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ dnf_repos_epel_timestamp }}" - -dnf_repos_pulp_url: # required +dnf_repos_pulp_url: "{{ 
appliances_pulp_url }}" dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index d9a339efd..2587e8499 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -9,6 +9,7 @@ slurm_appliance_secrets: vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}" vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}" vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" + vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}" secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml new file mode 100644 index 000000000..077871263 --- /dev/null +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -0,0 +1,75 @@ +pulp_site_url: "http://{{ appliances_pulp_url }}:{{ pulp_site_port }}" +pulp_site_port: 8080 +pulp_site_username: admin # shouldn't be changed +pulp_site_upstream_username: slurm-app-ci +pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" +pulp_site_password: "{{ vault_pulp_admin_password }}" +pulp_site_validate_certs: false +pulp_site_install_dir: '/home/rocky/pulp' +pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" + +pulp_site_rpm_repos: + - name: baseos + url: https://ark.stackhpc.com/pulp/content/rocky/9.4/BaseOS/x86_64/os/20240816T002610 + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand + state: present + - name: appstream + url: https://ark.stackhpc.com/pulp/content/rocky/9.4/AppStream/x86_64/os/20240816T002610 + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand + state: present + - name: crb + url: https://ark.stackhpc.com/pulp/content/rocky/9.4/CRB/x86_64/os/20240816T002610 + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand + state: present + - name: extras + url: https://ark.stackhpc.com/pulp/content/rocky/9.4/extras/x86_64/os/20240816T002610 + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand + state: present + - name: epel + url: https://ark.stackhpc.com/pulp/content/epel/9/Everything/x86_64/20240902T080424 + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand + state: present + +pulp_site_rpm_publications: +- repository: baseos + state: present +- repository: appstream + state: present +- repository: crb + state: present +- repository: extras + state: present +- repository: epel + state: present + +pulp_site_rpm_distributions: +- name: baseos + base_path: rocky/9.4/baseos + repository: baseos + state: present +- name: appstream + base_path: rocky/9.4/appstream + repository: appstream + state: present +- 
name: crb + base_path: rocky/9.4/crb + repository: crb + state: present +- name: extras + base_path: rocky/9.4/extras + repository: extras + state: present +- name: epel + base_path: epel/9 + repository: epel + state: present diff --git a/ansible/roles/pulp_site/tasks/install.yml b/ansible/roles/pulp_site/tasks/install.yml new file mode 100644 index 000000000..39b4fcd97 --- /dev/null +++ b/ansible/roles/pulp_site/tasks/install.yml @@ -0,0 +1,43 @@ +--- + +- name: Install packages + dnf: + name: + - podman + +- name: Create install directories + ansible.builtin.file: + state: directory + path: "{{ pulp_site_install_dir }}/{{ item }}" + loop: + - settings/certs + - pulp_storage + - pgsql + - containers + +- name: Template settings file + ansible.builtin.template: + src: settings.py.j2 + dest: "{{ pulp_site_install_dir }}/settings/settings.py" + +- name: Install pulp podman container + containers.podman.podman_container: + name: pulp + publish: + - "{{ pulp_site_port }}:80" + volume: + - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ pulp_site_selinux_suffix }}" + device: /dev/fuse + image: docker.io/pulp/pulp:3.68.1 + +- name: Reset admin password once container has initialised + no_log: true + ansible.builtin.shell: + cmd: "podman exec pulp bash -c 'pulpcore-manager reset-admin-password -p {{ pulp_site_password }}'" + register: _admin_reset_output + until: 0 == _admin_reset_output.rc + retries: 6 + delay: 30 diff --git a/ansible/roles/pulp_site/tasks/sync.yml b/ansible/roles/pulp_site/tasks/sync.yml new file mode 100644 index 000000000..62395f0f3 --- /dev/null +++ b/ansible/roles/pulp_site/tasks/sync.yml @@ -0,0 +1,73 @@ +--- + +- name: Wait for Pulp server + pulp.squeezer.status: + pulp_url: "{{ pulp_site_url }}" + username: "{{ pulp_site_username }}" + password: "{{ pulp_site_password }}" + register: _pulp_status + until: _pulp_status.failed == false + retries: 30 + delay: 20 + +- name: Ensure Pulp CLI config directory exists + ansible.builtin.file: + path: ~/.config/pulp + state: directory + +- name: Create config file + no_log: true + ansible.builtin.template: + src: cli.toml.j2 + dest: ~/.config/pulp/cli.toml + mode: '0644' + +- block: + - name: Ensure squeezer cache exists + ansible.builtin.file: + path: "{{ _cache_dir }}" + state: directory + + - name: Check if squeezer cache is populated + ansible.builtin.stat: + path: "{{ _cache_dir }}/api.json" + register: _cache_stat + + - name: Prepopulate squeezer cache # workaround for race on the cache + ansible.builtin.get_url: + url: "{{ pulp_site_url }}/pulp/api/v3/docs/api.json" + dest: "{{ _cache_dir }}/api.json" + timeout: 40 + when: not _cache_stat.stat.exists + vars: + _cache_dir: "~/.cache/squeezer/{{ pulp_site_url | regex_replace( ':|/' , '_' ) }}" + +- name: Get Pulp repos from release train + ansible.builtin.include_role: + name: stackhpc.pulp.pulp_repository + tasks_from: rpm.yml + vars: + pulp_url: "{{ pulp_site_url }}" + pulp_username: "{{ pulp_site_username }}" + pulp_password: "{{ pulp_site_password }}" + pulp_repository_rpm_repos: "{{ pulp_site_rpm_repos }}" + +- name: Create Pulp publications + ansible.builtin.include_role: + name: stackhpc.pulp.pulp_publication + tasks_from: rpm.yml + vars: + pulp_url: "{{ pulp_site_url }}" + pulp_username: "{{ 
pulp_site_username }}" + pulp_password: "{{ pulp_site_password }}" + pulp_publication_rpm: "{{ pulp_site_rpm_publications }}" + +- name: Create Pulp distributions + ansible.builtin.include_role: + name: stackhpc.pulp.pulp_distribution + tasks_from: rpm.yml + vars: + pulp_url: "{{ pulp_site_url }}" + pulp_username: "{{ pulp_site_username }}" + pulp_password: "{{ pulp_site_password }}" + pulp_distribution_rpm: "{{ pulp_site_rpm_distributions }}" diff --git a/ansible/roles/pulp_site/templates/cli.toml.j2 b/ansible/roles/pulp_site/templates/cli.toml.j2 new file mode 100644 index 000000000..06867902f --- /dev/null +++ b/ansible/roles/pulp_site/templates/cli.toml.j2 @@ -0,0 +1,14 @@ +[cli] +base_url = "{{ pulp_site_url }}" +username = "{{ pulp_site_username }}" +password = "{{ pulp_site_password }}" +api_root = "/pulp/" +domain = "default" +headers = [] +cert = "" +key = "" +verify_ssl = true +format = "json" +dry_run = false +timeout = 0 +verbose = 0 diff --git a/ansible/roles/pulp_site/templates/settings.py.j2 b/ansible/roles/pulp_site/templates/settings.py.j2 new file mode 100644 index 000000000..200212e2c --- /dev/null +++ b/ansible/roles/pulp_site/templates/settings.py.j2 @@ -0,0 +1,2 @@ +CONTENT_ORIGIN='http://{{ ansible_fqdn }}:{{ pulp_site_port }}' +TOKEN_AUTH_DISABLED=True diff --git a/ansible/site.yml b/ansible/site.yml index bb379399d..a09d5a510 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -28,6 +28,15 @@ - import_playbook: portal.yml - import_playbook: monitoring.yml +- hosts: dnf_repos + become: yes + tasks: + - name: Disable pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: disable_repos.yml + when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided + - name: Run post.yml hook vars: # hostvars not available here, so have to recalculate environment root: diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml index 98e366304..9d506d725 100644 --- a/environments/.stackhpc/hooks/post.yml +++ b/environments/.stackhpc/hooks/post.yml @@ -12,12 +12,3 @@ - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock - -- hosts: builder - become: yes - tasks: - - name: Disable ark repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: disable_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 9ea84740d..0fdbf9f60 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -17,12 +17,3 @@ - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml" - -- hosts: builder - become: yes - tasks: - - name: Replace system repos with ark - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml 
b/environments/.stackhpc/inventory/group_vars/builder.yml index 1a65daa48..50ef3d76c 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,2 +1,2 @@ #update_enable: false # Can uncomment for speed debugging non-update related build issues -dnf_repos_pulp_url: http://192.168.10.157:8080 +dnf_repos_pulp_url: http://192.168.10.157 diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 2a88f035d..a7bb92ee3 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -7,6 +7,7 @@ appliances_environment_name: "{{ appliances_environment_root | basename | regex_ appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform appliances_mode: configure +#appliances_pulp_url: #override required # Address(ip/dns) for internal communication between services. This is # normally traffic you do no want to expose to users. diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 9b9aa5bf0..a88ccf338 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -144,3 +144,7 @@ freeipa_client [lustre] # Hosts to run lustre client + +[dnf_repos:children] +# Hosts to replace system repos with Pulp repos +cluster diff --git a/requirements.txt b/requirements.txt index 6651506fb..7d81f3285 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ansible==6.0.0 +ansible==8.0.0 openstacksdk python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild python-manilaclient @@ -9,3 +9,4 @@ cookiecutter selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 netaddr matplotlib +pulp-cli==0.29.2 diff --git a/requirements.yml b/requirements.yml index 3d8c44011..6a461a6fa 100644 --- a/requirements.yml +++ b/requirements.yml @@ -49,4 +49,6 @@ collections: - name: https://github.com/azimuth-cloud/ansible-collection-image-utils type: git version: 0.4.0 + - name: stackhpc.pulp + version: 0.5.5 ... 
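Notes: with the pulp_site/dnf_repos split introduced above, clients resolve repos against the site-local Pulp mirror rather than Ark directly. As a rough sketch (illustrative only, not part of the patch), assuming appliances_pulp_url is http://192.168.10.157:8080 (the Leafcloud test value used in the .stackhpc builder config) and a Rocky 9.4 build, the baseos entry in dnf_repos_repolist resolves to:

- file: rocky
  name: baseos
  # dnf_repos_pulp_content_url + "/" + dnf_repos_rocky_prefix + "/baseos",
  # written into /etc/yum.repos.d/rocky.repo by yum_repository in set_repos.yml:
  base_url: "http://192.168.10.157:8080/pulp/content/rocky/9.4/baseos"
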
From 0d8a440e742fdbe985925dd5073c341135999567 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 6 Dec 2024 15:19:21 +0000 Subject: [PATCH 044/182] typos --- ansible/adhoc/deploy-pulp.yml | 2 +- environments/.stackhpc/inventory/group_vars/builder.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml index 291da7f59..cbb6bb6f6 100644 --- a/ansible/adhoc/deploy-pulp.yml +++ b/ansible/adhoc/deploy-pulp.yml @@ -22,4 +22,4 @@ dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml" content: | # ansible managed - appliances_pulp_server: "http://{{ pulp_server }}" + appliances_pulp_url: "http://{{ pulp_server }}" diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 50ef3d76c..f32bd2928 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,2 +1,2 @@ #update_enable: false # Can uncomment for speed debugging non-update related build issues -dnf_repos_pulp_url: http://192.168.10.157 +appliances_pulp_url: http://192.168.10.157 From 90a33fa3563c70c912dcaf821bfe91763c8cad9c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 6 Dec 2024 15:26:58 +0000 Subject: [PATCH 045/182] missed merge conflict --- environments/.stackhpc/hooks/post.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/environments/.stackhpc/hooks/post.yml b/environments/.stackhpc/hooks/post.yml index 31dafd8c1..9d506d725 100644 --- a/environments/.stackhpc/hooks/post.yml +++ b/environments/.stackhpc/hooks/post.yml @@ -12,15 +12,3 @@ - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock -<<<<<<< HEAD -======= - -- hosts: builder - become: yes - tasks: - - name: Disable ark repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: disable_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided ->>>>>>> main From eaa3680596fb1a717b9fbca3342ce58e43a43ca9 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 6 Dec 2024 15:39:00 +0000 Subject: [PATCH 046/182] moved pulp port into url --- ansible/roles/pulp_site/defaults/main.yml | 2 +- environments/.stackhpc/inventory/group_vars/builder.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 077871263..d4fa8aef6 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -1,4 +1,4 @@ -pulp_site_url: "http://{{ appliances_pulp_url }}:{{ pulp_site_port }}" +pulp_site_url: "http://{{ appliances_pulp_url }}" pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed pulp_site_upstream_username: slurm-app-ci diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index f32bd2928..609e5a0c4 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,2 +1,2 @@ #update_enable: false # Can uncomment for speed debugging non-update related build 
issues -appliances_pulp_url: http://192.168.10.157 +appliances_pulp_url: http://192.168.10.157:8080 From 9a75656497096cfc5acafbdb4bdefd1ac80e7b8e Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 6 Dec 2024 16:07:40 +0000 Subject: [PATCH 047/182] fixed port not getting added in adhoc --- ansible/adhoc/deploy-pulp.yml | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml index cbb6bb6f6..ad453e3f7 100644 --- a/ansible/adhoc/deploy-pulp.yml +++ b/ansible/adhoc/deploy-pulp.yml @@ -7,19 +7,31 @@ name: "{{ pulp_server }}" group: "_pulp_host" -- name: Install pulp on server +- name: Install pulp on server and add to config become: yes hosts: _pulp_host tasks: - - ansible.builtin.import_role: + + - name: Install pulp + ansible.builtin.include_role: name: pulp_site tasks_from: install.yml + public: true -- name: Add pulp host to environment - hosts: localhost - tasks: - - ansible.builtin.copy: + - name: Add pulp host to environment + become: no + delegate_to: localhost + ansible.builtin.copy: dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml" content: | # ansible managed - appliances_pulp_url: "http://{{ pulp_server }}" + appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" + +# - name: Add pulp host to environment +# hosts: localhost +# tasks: +# - ansible.builtin.copy: +# dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml" +# content: | +# # ansible managed +# appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" From 741872a7ee74e143241e5afd621b6442f99623e8 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 6 Dec 2024 16:15:23 +0000 Subject: [PATCH 048/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 14c997596..5e71beebd 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241203-1659-b0558b95", - "RL9": "openhpc-RL9-241203-1659-b0558b95" + "RL8": "openhpc-RL8-241206-1541-eaa36805", + "RL9": "openhpc-RL9-241206-1541-eaa36805" } } From 39cf55682d8324733fbe12cdd0d3291e6f312fed Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 6 Dec 2024 16:38:34 +0000 Subject: [PATCH 049/182] cleaned up disabling repos + now optional --- ansible/roles/dnf_repos/defaults/main.yml | 1 + .../roles/dnf_repos/tasks/disable_repos.yml | 20 ++++--------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 24bb4852b..359814e47 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -2,6 +2,7 @@ dnf_repos_pulp_url: "{{ appliances_pulp_url }}" dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" +dnf_repos_disable: true # epel installed separately dnf_repos_repolist: diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml 
index f8997b741..69aed3b6b 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -1,18 +1,6 @@ --- - name: Disable Pulp repos and remove creds - ansible.builtin.yum_repository: - file: "{{ item.file }}" - name: "{{ item.name }}" - baseurl: "{{ item.base_url }}" - description: "{{ item.name }}" - enabled: false - loop: "{{ dnf_repos_repolist }}" - -- name: Disable EPEL repo and remove creds - ansible.builtin.yum_repository: - name: epel - file: epel - description: epel - baseurl: "{{ dnf_repos_epel_baseurl }}" - gpgcheck: false - enabled: false + ansible.builtin.yum: + disablerepo: "{{ item.name }}" + loop: "{{ dnf_repos_repolist + [epel] }}" + when: dnf_repos_disable From 25644c362b93e1a8242aaca3e992c79ecd01d3bc Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 9 Dec 2024 12:05:48 +0000 Subject: [PATCH 050/182] typo --- ansible/roles/pulp_site/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index d4fa8aef6..f648696e3 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -1,4 +1,4 @@ -pulp_site_url: "http://{{ appliances_pulp_url }}" +pulp_site_url: "{{ appliances_pulp_url }}" pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed pulp_site_upstream_username: slurm-app-ci From fef3d566dcc40f1dad2cbab5fc2fb7d07d66eff3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 11 Dec 2024 14:53:15 +0000 Subject: [PATCH 051/182] repos now timestamped + synced at bootstrap --- .github/workflows/fatimage.yml | 1 + .github/workflows/nightlybuild.yml | 1 + ansible/adhoc/deploy-pulp.yml | 9 -- ansible/bootstrap.yml | 6 +- ansible/roles/dnf_repos/defaults/main.yml | 11 ++- ansible/roles/pulp_site/defaults/main.yml | 99 +++++++++---------- environments/.stackhpc/hooks/pre.yml | 9 -- .../inventory/group_vars/builder.yml | 10 +- .../inventory/group_vars/all/defaults.yml | 8 ++ environments/common/inventory/groups | 5 + 10 files changed, 82 insertions(+), 77 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 217b09c22..3a32f47b2 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -40,6 +40,7 @@ jobs: } } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} + LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }} steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9f45b0890..ee2b4b6f8 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -38,6 +38,7 @@ jobs: "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} + LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }} steps: - uses: actions/checkout@v2 diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml index ad453e3f7..89c51922a 100644 --- a/ansible/adhoc/deploy-pulp.yml +++ b/ansible/adhoc/deploy-pulp.yml @@ -26,12 +26,3 @@ content: | # ansible managed appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" - -# - name: Add pulp host to environment -# hosts: localhost -# tasks: -# - ansible.builtin.copy: -# dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml" -# content: | -# # ansible managed -# appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" diff --git 
a/ansible/bootstrap.yml b/ansible/bootstrap.yml index cc3cf7a12..dfe212d02 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -111,11 +111,15 @@ register: sestatus - name: Sync pulp repos with upstream - hosts: localhost + hosts: pulp tasks: + - debug: + var: hostvars[groups['builder'][0]]['ansible_facts'] - ansible.builtin.include_role: name: pulp_site tasks_from: sync.yml + apply: + delegate_to: localhost when: appliances_mode != 'configure' - hosts: dnf_repos diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 359814e47..0a09e5f3a 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -3,20 +3,21 @@ dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" dnf_repos_disable: true +dnf_repos_version_timestamps: "{{ appliances_repo_timestamps[ansible_distribution_version] }}" # epel installed separately dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/baseos" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/baseos/{{ dnf_repos_version_timestamps.baseos }}" - file: rocky name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/appstream" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/appstream/{{ dnf_repos_version_timestamps.appstream }}" - file: rocky name: crb - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/crb" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/crb/{{ dnf_repos_version_timestamps.crb }}" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ dnf_repos_version_timestamps.extras }}" -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_version }}/{{ dnf_repos_version_timestamps.epel }}" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index f648696e3..0fc92859a 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -3,73 +3,70 @@ pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed pulp_site_upstream_username: slurm-app-ci pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" +pulp_site_default_upstream_prefix: "https://ark.stackhpc.com/pulp/content/{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" +pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" +pulp_site_target_facts: "{{ hostvars[groups['builder'][0]]['ansible_facts'] }}" +pulp_site_target_arch: "{{ pulp_site_target_facts['architecture'] }}" +pulp_site_target_distribution: "{{ pulp_site_target_facts['distribution'] | lower }}" +pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" 
+pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" +pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}" -pulp_site_rpm_repos: - - name: baseos - url: https://ark.stackhpc.com/pulp/content/rocky/9.4/BaseOS/x86_64/os/20240816T002610 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - - name: appstream - url: https://ark.stackhpc.com/pulp/content/rocky/9.4/AppStream/x86_64/os/20240816T002610 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - - name: crb - url: https://ark.stackhpc.com/pulp/content/rocky/9.4/CRB/x86_64/os/20240816T002610 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - - name: extras - url: https://ark.stackhpc.com/pulp/content/rocky/9.4/extras/x86_64/os/20240816T002610 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - - name: epel - url: https://ark.stackhpc.com/pulp/content/epel/9/Everything/x86_64/20240902T080424 - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present + +pulp_site_rpm_repo_list: + - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" + url: "{{ pulp_site_default_upstream_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" + - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" + url: "{{ pulp_site_default_upstream_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" + - name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" + url: "{{ pulp_site_default_upstream_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" + - name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" + url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" + - name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" + url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_version_timestamps.epel }}" + +pulp_site_defaults: + remote_username: "{{ pulp_site_upstream_username }}" + remote_password: "{{ pulp_site_upstream_password }}" + policy: on_demand + state: present + +pulp_site_rpm_repos: "{{ pulp_site_rpm_repo_list | map('combine', pulp_site_defaults) }}" pulp_site_rpm_publications: -- repository: baseos +- repository: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" state: present -- repository: appstream +- repository: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" state: present -- repository: crb +- repository: "crb-{{ pulp_site_target_distribution_version }}-{{ 
pulp_site_version_timestamps.crb }}" state: present -- repository: extras +- repository: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" state: present -- repository: epel +- repository: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" state: present pulp_site_rpm_distributions: -- name: baseos - base_path: rocky/9.4/baseos - repository: baseos +- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" + base_path: "rocky/{{ pulp_site_target_distribution_version }}/baseos/{{ pulp_site_version_timestamps.baseos }}" + repository: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" state: present -- name: appstream - base_path: rocky/9.4/appstream - repository: appstream +- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" + base_path: "rocky/{{ pulp_site_target_distribution_version }}/appstream/{{ pulp_site_version_timestamps.appstream }}" + repository: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" state: present -- name: crb - base_path: rocky/9.4/crb - repository: crb +- name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" + base_path: "rocky/{{ pulp_site_target_distribution_version }}/crb/{{ pulp_site_version_timestamps.crb }}" + repository: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" state: present -- name: extras - base_path: rocky/9.4/extras - repository: extras +- name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" + base_path: "rocky/{{ pulp_site_target_distribution_version }}/extras/{{ pulp_site_version_timestamps.extras }}" + repository: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" state: present -- name: epel - base_path: epel/9 - repository: epel +- name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" + base_path: "epel/{{ pulp_site_target_distribution_version }}/{{ pulp_site_version_timestamps.epel }}" + repository: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" state: present diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 9ea84740d..0fdbf9f60 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -17,12 +17,3 @@ - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/hosts.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/secrets.yml" - "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/test_user.yml" - -- hosts: builder - become: yes - tasks: - - name: Replace system repos with ark - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 609e5a0c4..0fd19e1f9 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,2 +1,8 @@ -#update_enable: false # Can uncomment for speed debugging non-update related build issues -appliances_pulp_url: 
http://192.168.10.157:8080
+# update_enable: false # Can uncomment for speed debugging non-update related build issues
+pulp_server_config:
+  LEAFCLOUD:
+    url: http://192.168.10.157:8080
+    password: "{{ lookup('env','LEAFCLOUD_PULP_PASSWORD') }}"
+
+appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}"
+pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}"
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index a7bb92ee3..9d8a7ab33 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -81,3 +81,11 @@ appliances_local_users_extra: [] # see format of appliances_local_users_default
 appliances_local_users: "{{ appliances_local_users_default + appliances_local_users_extra }}"
 
 ###########################################################################################
+
+appliances_repo_timestamps:
+  '9.4':
+    baseos: 20240816T002610
+    appstream: 20240816T002610
+    crb: 20240816T002610
+    extras: 20240816T002610
+    epel: 20240902T080424
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index a88ccf338..fbfcfa0ca 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -148,3 +148,8 @@ freeipa_client
 [dnf_repos:children]
 # Hosts to replace system repos with Pulp repos
 cluster
+builder
+
+[pulp:children]
+# Hosts used to run Pulp API commands
+builder

From 1c4a511eeb9b7102941cf116fba78c978bd68c48 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Wed, 11 Dec 2024 17:08:16 +0000
Subject: [PATCH 052/182] refactored pulp_site list

---
 ansible/roles/pulp_site/defaults/main.yml | 60 ++++++-------
 .../filter_plugins/pulp-list-filters.py   | 31 ++++++++++
 2 files changed, 48 insertions(+), 43 deletions(-)
 create mode 100644 ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py

diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml
index 0fc92859a..6a9e98d74 100644
--- a/ansible/roles/pulp_site/defaults/main.yml
+++ b/ansible/roles/pulp_site/defaults/main.yml
@@ -16,57 +16,31 @@ pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_
 pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}"
 pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}"
 
-
-pulp_site_rpm_repo_list:
-  - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}"
-    url: "{{ pulp_site_default_upstream_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}"
-  - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}"
-    url: "{{ pulp_site_default_upstream_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}"
-  - name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}"
-    url: "{{ pulp_site_default_upstream_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}"
-  - name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}"
-    url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ 
pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" - - name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" - url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_version_timestamps.epel }}" - -pulp_site_defaults: - remote_username: "{{ pulp_site_upstream_username }}" - remote_password: "{{ pulp_site_upstream_password }}" - policy: on_demand - state: present - -pulp_site_rpm_repos: "{{ pulp_site_rpm_repo_list | map('combine', pulp_site_defaults) }}" - -pulp_site_rpm_publications: -- repository: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - state: present -- repository: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - state: present -- repository: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - state: present -- repository: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - state: present -- repository: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" - state: present - -pulp_site_rpm_distributions: +pulp_site_rpm_info: - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" + url: "{{ pulp_site_default_upstream_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" base_path: "rocky/{{ pulp_site_target_distribution_version }}/baseos/{{ pulp_site_version_timestamps.baseos }}" - repository: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - state: present - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" + url: "{{ pulp_site_default_upstream_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" base_path: "rocky/{{ pulp_site_target_distribution_version }}/appstream/{{ pulp_site_version_timestamps.appstream }}" - repository: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - state: present - name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" + url: "{{ pulp_site_default_upstream_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" base_path: "rocky/{{ pulp_site_target_distribution_version }}/crb/{{ pulp_site_version_timestamps.crb }}" - repository: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - state: present - name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" + url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" base_path: "rocky/{{ pulp_site_target_distribution_version }}/extras/{{ pulp_site_version_timestamps.extras }}" - repository: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - state: present - name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" + url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ 
pulp_site_target_arch }}/{{ pulp_site_version_timestamps.epel }}"
+    base_path: "epel/{{ pulp_site_target_distribution_version }}/{{ pulp_site_version_timestamps.epel }}"
+
+pulp_site_rpm_repo_defaults:
+  remote_username: "{{ pulp_site_upstream_username }}"
+  remote_password: "{{ pulp_site_upstream_password }}"
+  policy: on_demand
+  state: present
+
+_pulp_site_rpm_info_all: "{{ pulp_site_rpm_info | map('combine', pulp_site_rpm_repo_defaults) }}"
+
+pulp_site_rpm_repos: "{{ _pulp_site_rpm_info_all | to_rpm_repos }}"
+pulp_site_rpm_publications: "{{ _pulp_site_rpm_info_all | to_rpm_pubs }}"
+pulp_site_rpm_distributions: "{{ _pulp_site_rpm_info_all | to_rpm_distros }}"
diff --git a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py
new file mode 100644
index 000000000..94d89d184
--- /dev/null
+++ b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py
@@ -0,0 +1,31 @@
+class FilterModule(object):
+    def filters(self):
+        return {
+            'to_rpm_repos': self.to_rpm_repos,
+            'to_rpm_pubs': self.to_rpm_pubs,
+            'to_rpm_distros': self.to_rpm_distros
+        }
+
+    def to_rpm_repos(self, list):
+        repo_list = map(lambda x: {
+            'name': x['name'],
+            'url': x['url'],
+            'remote_username': x['remote_username'],
+            'remote_password': x['remote_password'],
+            'policy': x['policy'],
+            'state': x['state'] }, list)
+        return repo_list
+
+    def to_rpm_pubs(self, list):
+        pub_list = map(lambda x: {
+            'repository': x['name'],
+            'state': x['state'] }, list)
+        return pub_list
+
+    def to_rpm_distros(self, list):
+        distro_list = map(lambda x: {
+            'name': x['name'],
+            'repository': x['name'],
+            'base_path': x['base_path'],
+            'state': x['state'] }, list)
+        return distro_list
\ No newline at end of file

From 558874b3949253a34baec5e91d917d7965710725 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Thu, 12 Dec 2024 11:46:24 +0000
Subject: [PATCH 053/182] Added extra package installs to bootstrap

---
 ansible/bootstrap.yml | 9 ++++++++
 .../inventory/group_vars/all/defaults.yml | 21 ++++++++++++++++++-
 environments/common/inventory/groups | 5 +++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index 733d4b3f8..432a2a319 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -216,6 +216,15 @@
       msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details"
     when: "update_enable | default('false') | bool"
 
+- hosts: extra_packages
+  become: yes
+  tags:
+    - extra_packages
+  tasks:
+    - dnf:
+        name: "{{ appliances_extra_packages }}"
+      when: appliances_mode != 'configure' or appliances_packages_during_configure
+
 - hosts:
   - selinux
   - update
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 2a88f035d..c6bf8564b 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -79,4 +79,23 @@ appliances_local_users_default:
 appliances_local_users_extra: [] # see format of appliances_local_users_default above
 appliances_local_users: "{{ appliances_local_users_default + appliances_local_users_extra }}"
 
-###########################################################################################
+################## bootstrap: extra package installs ######################################
+
+appliances_default_extra_packages: + - htop + - nano + - screen + - tmux + - wget + - bind-utils + - net-tools + - postfix + - git + - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" + + +appliances_packages_during_configure: false + +appliances_other_extra_packages: [] + +appliances_extra_packages: "{{ appliances_default_extra_packages + appliances_other_extra_packages }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 9b9aa5bf0..d8ad503fe 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -144,3 +144,8 @@ freeipa_client [lustre] # Hosts to run lustre client + +[extra_packages:children] +# Hosts to install specified additional packages on +cluster +builder From 187bc40b898b42f1cb67fea653687cb6c4499dd3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 13:13:02 +0000 Subject: [PATCH 054/182] added pulp sync adhoc and temporarily moved out of ci --- ansible/adhoc/sync-pulp.yml | 11 +++++++++++ ansible/roles/pulp_site/tasks/sync.yml | 5 +++++ environments/.stackhpc/inventory/extra_groups | 2 ++ 3 files changed, 18 insertions(+) create mode 100644 ansible/adhoc/sync-pulp.yml diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml new file mode 100644 index 000000000..9c7684445 --- /dev/null +++ b/ansible/adhoc/sync-pulp.yml @@ -0,0 +1,11 @@ +- hosts: localhost + tasks: + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + vars: + pulp_site_target_arch: "x86_64" + pulp_site_target_distribution: "rocky" + pulp_site_target_distribution_version: "9.4" + pulp_site_target_distribution_version_major: "9" + pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}" diff --git a/ansible/roles/pulp_site/tasks/sync.yml b/ansible/roles/pulp_site/tasks/sync.yml index 62395f0f3..5ef2bc5f1 100644 --- a/ansible/roles/pulp_site/tasks/sync.yml +++ b/ansible/roles/pulp_site/tasks/sync.yml @@ -1,5 +1,10 @@ --- +- ansible.builtin.assert: + that: pulp_site_upstream_password != '' + quiet: true + fail_msg: "Upstream password not set. Either set env var ARK_PASSWORD or override pulp_site_upstream_password." 
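+# NB: a sketch of running this sync standalone via the adhoc playbook added above,
+# with Ark credentials supplied via the environment (<password> is a placeholder):
+#   ARK_PASSWORD=<password> ansible-playbook ansible/adhoc/sync-pulp.yml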
+ - name: Wait for Pulp server pulp.squeezer.status: pulp_url: "{{ pulp_site_url }}" diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 7c9a7c774..c2002c59f 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -31,3 +31,5 @@ compute [squid:children] # Install squid into fat image builder + +[pulp:children] From 580b0b3b943af7272c85f46950a8e3382cdbca34 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 13:28:39 +0000 Subject: [PATCH 055/182] fixed disabling for ci --- ansible/bootstrap.yml | 4 +--- environments/.stackhpc/inventory/extra_groups | 2 -- environments/.stackhpc/inventory/group_vars/builder.yml | 1 + environments/common/inventory/group_vars/all/defaults.yml | 1 + 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index dfe212d02..8c46c5e24 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -113,14 +113,12 @@ - name: Sync pulp repos with upstream hosts: pulp tasks: - - debug: - var: hostvars[groups['builder'][0]]['ansible_facts'] - ansible.builtin.include_role: name: pulp_site tasks_from: sync.yml apply: delegate_to: localhost - when: appliances_mode != 'configure' + when: appliances_mode != 'configure' and appliances_sync_pulp_on_build - hosts: dnf_repos become: yes diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index c2002c59f..7c9a7c774 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -31,5 +31,3 @@ compute [squid:children] # Install squid into fat image builder - -[pulp:children] diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 0fd19e1f9..c4b01b03f 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -6,3 +6,4 @@ pulp_server_config: appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" +appliances_sync_pulp_on_build: false diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 9d8a7ab33..f2a6723ad 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,6 +82,7 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ########################################################################################### +appliances_sync_pulp_on_build: true appliances_repo_timestamps: '9.4': baseos: 20240816T002610 From 2ed66742bb9b665d879af8c0b5e6e6aa6d434163 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 13:39:54 +0000 Subject: [PATCH 056/182] made dnf epel repo more configurable --- ansible/roles/dnf_repos/defaults/main.yml | 1 + ansible/roles/dnf_repos/tasks/set_repos.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 0a09e5f3a..d4c80b0c9 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -21,3 +21,4 @@ dnf_repos_repolist: base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ 
dnf_repos_version_timestamps.extras }}"
 
 dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_version }}/{{ dnf_repos_version_timestamps.epel }}"
+dnf_repos_epel_description: "epel"
diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml
index 8a8364097..dea803902 100644
--- a/ansible/roles/dnf_repos/tasks/set_repos.yml
+++ b/ansible/roles/dnf_repos/tasks/set_repos.yml
@@ -17,6 +17,6 @@
   ansible.builtin.yum_repository:
     name: epel
     file: epel
-    description: epel
+    description: "{{ dnf_repos_epel_description }}"
     gpgcheck: false
     baseurl: "{{ dnf_repos_epel_baseurl }}"

From efd2883211fbe9563568ff13a2c2759ef9ef31a3 Mon Sep 17 00:00:00 2001
From: Steve Brasier <33413598+sjpb@users.noreply.github.com>
Date: Thu, 12 Dec 2024 13:40:41 +0000
Subject: [PATCH 057/182] Add role to install NVIDIA DOCA on top of an existing
 "fat" image (#492)

* add doca role run by fatimage
* add workflow to test doca build
* make packer inventory groups clearer and allow defining no extra
* update packer workflows for new packer config
* define builds entirely via matrix
* WIP: do DOCA CI build on top of current fat image
* fixup matrix for changes
* fix doca workflow typo
* use current fatimage for doca test build
* enable fatimage to be used for volume-backed builds
* bump CI image
* doca workflow: clean up image and only run on relevant changes
* remove commented-out code
* add DOCA README
* fix DOCA role actually running
* tidyup DOCA play
* include doca packages in image summary
* fix squid being selected for any stackhpc build VM
* fix nightly build concurrency
* re-add squid back to Stackhpc builder group
* remove debugging exit
* update image build docs
* update packer docs
---
 .github/workflows/doca.yml                    | 132 ++++++++++++++++++
 .github/workflows/fatimage.yml                |  38 +++--
 .github/workflows/nightlybuild.yml            |  54 +++----
 ansible/.gitignore                            |   2 +
 ansible/cleanup.yml                           |   5 +
 ansible/fatimage.yml                          |  11 ++
 ansible/roles/doca/README.md                  |  12 ++
 ansible/roles/doca/defaults/main.yml          |   3 +
 .../roles/doca/tasks/install-kernel-devel.yml |  24 ++++
 ansible/roles/doca/tasks/install.yml          |  53 +++++++
 ansible/roles/doca/tasks/main.yml             |   1 +
 docs/image-build.md                           |  74 +++-------
 .../terraform/cluster_image.auto.tfvars.json  |   4 +-
 packer/openstack.pkr.hcl                      |  65 +++------
 14 files changed, 323 insertions(+), 155 deletions(-)
 create mode 100644 .github/workflows/doca.yml
 create mode 100644 ansible/roles/doca/README.md
 create mode 100644 ansible/roles/doca/defaults/main.yml
 create mode 100644 ansible/roles/doca/tasks/install-kernel-devel.yml
 create mode 100644 ansible/roles/doca/tasks/install.yml
 create mode 100644 ansible/roles/doca/tasks/main.yml

diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml
new file mode 100644
index 000000000..cfd3bb982
--- /dev/null
+++ b/.github/workflows/doca.yml
@@ -0,0 +1,132 @@
+name: Test DOCA extra build
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
+      - 'ansible/roles/doca/**'
+      - '.github/workflows/doca.yml'
+  pull_request:
+    paths:
+      - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
+      - 'ansible/roles/doca/**'
+      - '.github/workflows/doca.yml'
+
+jobs:
+  doca:
+    name: doca-build
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
+      cancel-in-progress: true
+    runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false # allow other matrix jobs to continue even if 
one fails
+      matrix: # build RL8, RL9
+        build:
+          - image_name: openhpc-doca-RL8
+            source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+            inventory_groups: doca
+          - image_name: openhpc-doca-RL9
+            source_image_name_key: RL9
+            inventory_groups: doca
+    env:
+      ANSIBLE_FORCE_COLOR: True
+      OS_CLOUD: openstack
+      CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
+      ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Load current fat images into GITHUB_ENV
+        # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string
+        run: |
+          {
+            echo 'FAT_IMAGES<<EOF'
+            cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+            echo EOF
+          } >> "$GITHUB_ENV"
+
+      - name: Record settings
+        run: |
+          echo CI_CLOUD: ${{ env.CI_CLOUD }}
+          echo FAT_IMAGES: ${FAT_IMAGES}
+
+      - name: Setup ssh
+        run: |
+          set -x
+          mkdir ~/.ssh
+          echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
+          chmod 0600 ~/.ssh/id_rsa
+        shell: bash
+
+      - name: Add bastion's ssh key to known_hosts
+        run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
+        shell: bash
+
+      - name: Install ansible etc
+        run: dev/setup-env.sh
+
+      - name: Write clouds.yaml
+        run: |
+          mkdir -p ~/.config/openstack/
+          echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
+        shell: bash
+
+      - name: Setup environment
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+
+      - name: Build fat image with packer
+        id: packer_build
+        run: |
+          set -x
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          cd packer/
+          packer init .
+
+          PACKER_LOG=1 packer build \
+            -on-error=${{ vars.PACKER_ON_ERROR }} \
+            -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
+            -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
+            -var "image_name=${{ matrix.build.image_name }}" \
+            -var "inventory_groups=${{ matrix.build.inventory_groups }}" \
+            openstack.pkr.hcl
+
+      - name: Get created image names from manifest
+        id: manifest
+        run: |
+          . venv/bin/activate
+          IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
+          while ! openstack image show -f value -c name $IMAGE_ID; do
+            sleep 5
+          done
+          IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
+          echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
+          echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
+          echo $IMAGE_ID > image-id.txt
+          echo $IMAGE_NAME > image-name.txt
+
+      - name: Make image usable for further builds
+        run: |
+          . venv/bin/activate
+          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
+
+      - name: Delete image for automatically-run workflows
+        run: |
+          . 
venv/bin/activate + openstack image delete "${{ steps.manifest.outputs.image-id }}" + if: ${{ github.event_name != 'workflow_dispatch' }} + + - name: Upload manifest artifact + uses: actions/upload-artifact@v4 + with: + name: image-details-${{ matrix.build.image_name }} + path: | + ./image-id.txt + ./image-name.txt + overwrite: true diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 217b09c22..da933c91d 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,30 +15,23 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.openhpc + - image_name: openhpc-RL8 + source_image_name: rocky-latest-RL8 + inventory_groups: control,compute,login + - image_name: openhpc-RL9 + source_image_name: rocky-latest-RL9 + inventory_groups: control,compute,login env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud }} - SOURCE_IMAGES_MAP: | - { - "RL8": { - "openstack.openhpc": "rocky-latest-RL8" - }, - "RL9": { - "openstack.openhpc": "rocky-latest-RL9" - } - } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: @@ -85,13 +78,11 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} - name: Get created image names from manifest id: manifest @@ -102,13 +93,20 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" echo $IMAGE_ID > image-id.txt echo $IMAGE_NAME > image-name.txt + - name: Make image usable for further builds + run: | + . 
venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + name: image-details-${{ matrix.build.image_name }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9f45b0890..a0e78cd0b 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -11,32 +11,29 @@ on: - SMS - ARCUS schedule: - - cron: '0 0 * * *' # Run at midnight + - cron: '0 0 * * *' # Run at midnight on default branch jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.rocky-latest + - image_name: rocky-latest-RL8 + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: update + - image_name: rocky-latest-RL9 + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} - SOURCE_IMAGES_MAP: | - { - "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", - "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" - } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: @@ -83,15 +80,12 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }} - - name: Get created image names from manifest id: manifest run: | @@ -125,7 +119,7 @@ jobs: name: upload-nightly-targets needs: openstack concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -135,18 +129,15 @@ jobs: - LEAFCLOUD - SMS - ARCUS - os_version: - - RL8 - - RL9 - image: - - rocky-latest + build: + - image_name: rocky-latest-RL8 + - image_name: rocky-latest-RL9 exclude: - target_cloud: LEAFCLOUD env: OS_CLOUD: openstack SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} TARGET_CLOUD: ${{ matrix.target_cloud }} - IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}" steps: - uses: actions/checkout@v2 @@ -161,42 +152,37 @@ jobs: . 
venv/bin/activate pip install -U pip pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - shell: bash - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml - shell: bash - name: Download source image run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml - openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }} - shell: bash + openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }} - name: Upload to target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - openstack image create "${{ env.IMAGE_NAME }}" \ - --file "${{ env.IMAGE_NAME }}" \ + openstack image create "${{ matrix.build.image_name }}" \ + --file "${{ matrix.build.image_name }}" \ --disk-format qcow2 \ - shell: bash - name: Delete old latest image from target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l) + IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l) if [ "$IMAGE_COUNT" -gt 1 ]; then - OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1) + OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1) openstack image delete "$OLD_IMAGE_ID" else echo "Only one image exists, skipping deletion." 
fi - shell: bash diff --git a/ansible/.gitignore b/ansible/.gitignore index 48c917c4f..3fef64ecc 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -66,3 +66,5 @@ roles/* !roles/lustre/** !roles/dnf_repos/ !roles/dnf_repos/** +!roles/doca/ +!roles/doca/** diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index cf9b0bdab..3f059d157 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -61,5 +61,10 @@ os: "{{ ansible_distribution }} {{ ansible_distribution_version }}" kernel: "{{ ansible_kernel }}" ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}" + doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}" cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" + +- name: Show image summary + debug: + var: image_info diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b28e4f308..439c50e70 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -6,6 +6,9 @@ tasks: - name: Report hostname (= final image name) command: hostname + - name: Report inventory groups + debug: + var: group_names - name: Run pre.yml hook vars: @@ -199,6 +202,14 @@ name: cloudalchemy.grafana tasks_from: install.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/roles/doca/README.md b/ansible/roles/doca/README.md new file mode 100644 index 000000000..5f898add5 --- /dev/null +++ b/ansible/roles/doca/README.md @@ -0,0 +1,12 @@ +# doca + +Install [NVIDIA DOCA](https://docs.nvidia.com/doca/sdk/index.html). + +This role is not idempotent and is only intended to be run during an image build. It builds DOCA kernel modules to match the installed kernel and then installs these +plus the selected DOCA packages. + +## Role Variables + +- `doca_version`: Optional. String giving doca version. +- `doca_profile`: Optional. Name of [profile](https://docs.nvidia.com/doca/sdk/nvidia+doca+profiles/index.html) defining subset of DOCA to install. Default is `doca-ofed`. +- `doca_repo_url`: Optional. URL of DOCA repository. Default is appropriate upstream public repository for DOCA version, distro version and architecture. diff --git a/ansible/roles/doca/defaults/main.yml b/ansible/roles/doca/defaults/main.yml new file mode 100644 index 000000000..66437cd04 --- /dev/null +++ b/ansible/roles/doca/defaults/main.yml @@ -0,0 +1,3 @@ +doca_version: '2.9.1' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates +doca_profile: doca-ofed +doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" diff --git a/ansible/roles/doca/tasks/install-kernel-devel.yml b/ansible/roles/doca/tasks/install-kernel-devel.yml new file mode 100644 index 000000000..6a1943a32 --- /dev/null +++ b/ansible/roles/doca/tasks/install-kernel-devel.yml @@ -0,0 +1,24 @@ +- name: Get installed kernels + command: dnf list --installed kernel + register: _ofed_dnf_kernels + changed_when: false + +- name: Determine running kernel + command: uname -r # e.g. 
4.18.0-513.18.1.el8_9.x86_64
+  register: _ofed_loaded_kernel
+  changed_when: false
+
+- name: Check current kernel is newest installed
+  assert:
+    that: _ofed_kernel_current == _ofed_dnf_kernels_newest
+    fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?"
+  vars:
+    _ofed_kernel_current: >-
+      {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }}
+    _ofed_dnf_kernels_newest: >-
+      {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }}
+  # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos "
+
+- name: Install matching kernel-devel package
+  dnf:
+    name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}"
diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml
new file mode 100644
index 000000000..9d297e946
--- /dev/null
+++ b/ansible/roles/doca/tasks/install.yml
@@ -0,0 +1,53 @@
+- import_tasks: install-kernel-devel.yml
+
+- name: Install DOCA repo
+  ansible.builtin.yum_repository:
+    name: doca
+    file: doca
+    description: DOCA Online Repo
+    baseurl: "{{ doca_repo_url }}"
+    enabled: true
+    gpgcheck: false
+
+- name: Install doca-extra package
+  ansible.builtin.dnf:
+    name: doca-extra
+
+- name: Build DOCA kernel modules
+  ansible.builtin.shell:
+    cmd: /opt/mellanox/doca/tools/doca-kernel-support
+  register: _doca_kernel_build
+
+
+- name: Find generated doca-kernel-repo
+  ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*'
+  register: _doca_kernel_repo # e.g. /tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm
+  changed_when: false
+
+- name: Create dnf cache
+  ansible.builtin.command: dnf makecache
+
+- name: Install DOCA repository package
+  ansible.builtin.dnf:
+    name: "{{ _doca_kernel_repo.stdout }}"
+    disable_gpg_check: true
+
+- name: Install DOCA packages
+  ansible.builtin.dnf:
+    name: "{{ doca_profile }}"
+
+- name: Cleanup DOCA build directories
+  ansible.builtin.file:
+    state: absent
+    path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # i.e. the /tmp/DOCA.* build directory
+
+- name: Update initramfs
+  ansible.builtin.command:
+    cmd: dracut -f --tmpdir /var/tmp
+  environment:
+    TMPDIR: /var/tmp
+  register: _doca_dracut
+  failed_when: _doca_dracut.stderr != '' # appears rc is always 0
+
+- name: Load the new driver
+  ansible.builtin.command: /etc/init.d/openibd restart
diff --git a/ansible/roles/doca/tasks/main.yml b/ansible/roles/doca/tasks/main.yml
new file mode 100644
index 000000000..e7a272f38
--- /dev/null
+++ b/ansible/roles/doca/tasks/main.yml
@@ -0,0 +1 @@
+- include_tasks: install.yml
diff --git a/docs/image-build.md b/docs/image-build.md
index 4896bde57..a7d2e951b 100644
--- a/docs/image-build.md
+++ b/docs/image-build.md
@@ -2,87 +2,57 @@
 
 The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images.
 
-The Packer configuration defined here builds "fat images" which contain binaries for all nodes, but no cluster-specific configuration. Using these:
+The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these:
- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). - Improves deployment speed by reducing the number of package downloads to improve deployment speed. -By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image. - -The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: +The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: 1. Build site-specific fat images from scratch. -2. Extend an existing fat image with additional software. +2. Extend an existing fat image with additional functionality. # Usage -The steps for building site-specific fat images or extending an existing fat image are the same: +To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image: 1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). -2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum e.g.: +2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum: ```hcl flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to + source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image + inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to + ``` + Note that: - - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - - For an example of configuration for extending an existing fat image see below. + - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). + - The flavor used must have sufficent memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficent. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. + - The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. 
+ - The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image. All possible groups are listed in `environments/common/groups` but common options for this variable will be: + - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. + - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. 3. Activate the venv and the relevant environment. 4. Build images using the relevant variable definition file, e.g.: cd packer/ - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - - Note that the `-only` flag here restricts Packer to a single specific "build" definition (in Packer terminology). Options here are: - - `-only=openstack.openhpc`: Build a fat image including Mellanox OFED - - `-only=openstack.openhpc-cuda`: Build a fat image including Mellanox OFED, Nvidia drivers and CUDA - - `-only=openstack.openhpc-extra`: Build an image which *extends* an existing fat image - -5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. - -# Defining an "extra" image build - -An "extra" image build starts with an existing fat image (e.g. one provided by StackHPC) rather than a RockyLinux GenericCloud image, and only runs a specific subset of the -Ansible in the appliance. This allows adding additional functionality into site-specific images, without modifying the existing functionality in the base fat image. This is the recommended way to build site-specific images. - -To configure an "extra" image build, prepare a Packer variable definition file as described above but also including: - -- `extra_build_image_name`: A string to add into the final image name. -- `source_image` or `source_image_name`: The UUID or name of the fat image to start from (which must already be present in OpenStack). -- `extra_build_groups`: A list of Ansible inventory groups to put the build VM into, in addition to the `builder` group. This defines the roles/functionality - which are added to the image. -- `extra_build_volume_size`: A number giving the size in GB of the volume for the build VM's root disk and therefore the resulting image size. - Note this assumes the default of `use_blockstorage_volume = true`. - -E.g. to add the lustre client to an RockyLinux 9 image: - - # environments/site/lustre.pkvars.hcl - - extra_build_image_name = "lustre" # output image name will be like "openhpc-lustre-RL9-$timestamp-$commit" - source_image_name = "openhpc-ofed-RL9-240906-1041-32568dbb" # e.g. current StackHPC RL9 image - extra_build_groups = ["lustre"] # only run lustre role during this extra build - extra_build_volume_size = 15 # default non-CUDA build image size has enough free space - - # ... 
define flavor, network, etc as normal - - -Then, reference this build and variables file in the Packer build command: + PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc-extra --on-error=ask -var-file=environments/site/lustre.pkvars.hcl openstack.pkr.hcl + **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: -**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: + openstack image show $SOURCE_IMAGE - openstack image show $SOURCE_IMAGE + If it does, remove this property: -If it does, remove this property: + openstack image unset --property signature_verified $SOURCE_IMAGE - openstack image unset --property signature_verified $SOURCE_IMAGE + then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). -then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). +5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. # Build Process diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 14c997596..5b9d845ef 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241203-1659-b0558b95", - "RL9": "openhpc-RL9-241203-1659-b0558b95" + "RL8": "openhpc-RL8-241211-1322-ded60c2c", + "RL9": "openhpc-RL9-241211-1322-ded60c2c" } } diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 52202ead1..2ba0a1e63 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -23,6 +23,7 @@ data "git-commit" "cwd-head" { } locals { git_commit = data.git-commit.cwd-head.hash timestamp = formatdate("YYMMDD-hhmm", timestamp()) + image_name_version = var.image_name_version == "auto" ? 
"-${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_name_version } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -39,12 +40,6 @@ variable "networks" { type = list(string) } -variable "os_version" { - type = string - description = "'RL8' or 'RL9' with default source_image_* mappings" - default = "RL9" -} - # Must supply either source_image_name or source_image_id variable "source_image_name" { type = string @@ -123,15 +118,6 @@ variable "volume_type" { } variable "volume_size" { - type = map(number) - default = { - # fat image builds, GB: - rocky-latest = 15 - openhpc = 15 - } -} - -variable "extra_build_volume_size" { type = number default = 15 } @@ -146,25 +132,22 @@ variable "metadata" { default = {} } -variable "groups" { - type = map(list(string)) - description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" - default = { - # fat image builds: - rocky-latest = ["update"] - openhpc = ["control", "compute", "login"] - } +variable "inventory_groups" { + type = string + description = "Comma-separated list of additional inventory groups (other than 'builder') to add build VM to. Default is none." + default = "" } -variable "extra_build_groups" { - type = list(string) - default = [] +variable "image_name" { + type = string + description = "Name of image" + default = "openhpc" } -variable "extra_build_image_name" { +variable "image_name_version" { type = string - description = "Infix for 'extra' build image name" - default = "extra" + description = "Suffix for image name giving version. Default of 'auto' appends timestamp + short commit" + default = "auto" } source "openstack" "openhpc" { @@ -172,9 +155,11 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type - volume_size = lookup(var.volume_size, source.name, var.extra_build_volume_size) + volume_size = var.volume_size metadata = var.metadata - instance_metadata = {ansible_init_disable = "true"} + instance_metadata = { + ansible_init_disable = "true" + } networks = var.networks floating_ip_network = var.floating_ip_network security_groups = var.security_groups @@ -200,27 +185,13 @@ source "openstack" "openhpc" { build { - # latest nightly image: - source "source.openstack.openhpc" { - name = "rocky-latest" - image_name = "${source.name}-${var.os_version}" - } - - # fat image: - source "source.openstack.openhpc" { - name = "openhpc" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" - } - - # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { - name = "openhpc-extra" - image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "${var.image_name}${local.image_name_version}" } provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], lookup(var.groups, source.name, var.extra_build_groups)) + groups = concat(["builder"], var.inventory_groups == "" ? 
[] : split(",", var.inventory_groups)) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [ From d12083a6953499ae8c116660ec45aca0183239b0 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 14:10:27 +0000 Subject: [PATCH 058/182] moved repo enable/disable into fatimage --- ansible/bootstrap.yml | 19 ------------------- ansible/fatimage.yml | 28 ++++++++++++++++++++++++++++ ansible/site.yml | 9 --------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 8c46c5e24..733d4b3f8 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -110,25 +110,6 @@ policy: "{{ selinux_policy }}" register: sestatus -- name: Sync pulp repos with upstream - hosts: pulp - tasks: - - ansible.builtin.include_role: - name: pulp_site - tasks_from: sync.yml - apply: - delegate_to: localhost - when: appliances_mode != 'configure' and appliances_sync_pulp_on_build - -- hosts: dnf_repos - become: yes - tasks: - - name: Replace system repos with pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml - when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided - # --- tasks after here require access to package repos --- - hosts: squid tags: squid diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b28e4f308..7c83fc2a2 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -14,6 +14,25 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- name: Sync pulp repos with upstream + hosts: pulp + tasks: + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + apply: + delegate_to: localhost + when: appliances_mode != 'configure' and appliances_sync_pulp_on_build + +- hosts: dnf_repos + become: yes + tasks: + - name: Replace system repos with pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml + when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided + - import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook @@ -199,6 +218,15 @@ name: cloudalchemy.grafana tasks_from: install.yml +- hosts: dnf_repos + become: yes + tasks: + - name: Disable pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: disable_repos.yml + when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/site.yml b/ansible/site.yml index a09d5a510..bb379399d 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -28,15 +28,6 @@ - import_playbook: portal.yml - import_playbook: monitoring.yml -- hosts: dnf_repos - become: yes - tasks: - - name: Disable pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: disable_repos.yml - when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided - - name: Run post.yml hook vars: # hostvars not available here, so have to recalculate environment root: From 07dc9b796ff32002e83cae21b29ded39d688a750 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 14:28:31 +0000 Subject: [PATCH 059/182] fixed disable repos task --- 
ansible/roles/dnf_repos/tasks/disable_repos.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml index 69aed3b6b..53459ce49 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -1,6 +1,6 @@ --- - name: Disable Pulp repos and remove creds ansible.builtin.yum: - disablerepo: "{{ item.name }}" - loop: "{{ dnf_repos_repolist + [epel] }}" + disablerepo: "{{ item }}" + loop: "{{ dnf_repos_repolist | map(attribute='name') + ['epel'] }}" when: dnf_repos_disable From 3088f8375dcd5e7b4bb98b7dab008f59f36fda1c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 15:28:08 +0000 Subject: [PATCH 060/182] reverted disable repos task --- .../roles/dnf_repos/tasks/disable_repos.yml | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml index 53459ce49..2dbacc262 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -1,6 +1,18 @@ --- -- name: Disable Pulp repos and remove creds - ansible.builtin.yum: - disablerepo: "{{ item }}" - loop: "{{ dnf_repos_repolist | map(attribute='name') + ['epel'] }}" - when: dnf_repos_disable +- name: Disable Pulp repos + ansible.builtin.yum_repository: + file: "{{ item.file }}" + name: "{{ item.name }}" + baseurl: "{{ item.base_url }}" + description: "{{ item.name }}" + enabled: false + loop: "{{ dnf_repos_repolist }}" + +- name: Disable EPEL repo + ansible.builtin.yum_repository: + name: epel + file: epel + description: "{{ dnf_repos_epel_description }}" + baseurl: "{{ dnf_repos_epel_baseurl }}" + gpgcheck: false + enabled: false From c74360bf325c615b11db342a367538da5467cc1d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 15:50:24 +0000 Subject: [PATCH 061/182] fatimage with test latest (REVERT LATER) --- .github/workflows/fatimage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 331035001..d368d86cd 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -26,7 +26,7 @@ jobs: source_image_name: rocky-latest-RL8 inventory_groups: control,compute,login - image_name: openhpc-RL9 - source_image_name: rocky-latest-RL9 + source_image_name: rocky-latest-RL9-241212-1532-3088f837 inventory_groups: control,compute,login env: ANSIBLE_FORCE_COLOR: True From 67ce24bc5dda26b1ab2539e2627f6dfb59eb1b3b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 16:36:45 +0000 Subject: [PATCH 062/182] refactored pulp deploy and added pulp docs --- README.md | 2 +- ansible/adhoc/deploy-pulp.yml | 13 ++++++------- docs/experimental/pulp.md | 17 +++++++++++++++++ .../common/inventory/group_vars/all/pulp.yml | 1 + 4 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 docs/experimental/pulp.md create mode 100644 environments/common/inventory/group_vars/all/pulp.yml diff --git a/README.md b/README.md index f61bf8df4..f66441915 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ It requires an OpenStack cloud, and an Ansible "deploy host" with access to that Before starting ensure that: - You have root access on the deploy host. - You can create instances using a Rocky 9 GenericCloud image (or an image based on that). 
- - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI. However the appliance will install the necessary packages if a GenericCloud image is used.
+ - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI.
 - You have a SSH keypair defined in OpenStack, with the private key available on the deploy host.
 - Created instances have access to internet (note proxies can be setup through the appliance if necessary).
 - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance).
diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml
index 89c51922a..38cb79289 100644
--- a/ansible/adhoc/deploy-pulp.yml
+++ b/ansible/adhoc/deploy-pulp.yml
@@ -18,11 +18,10 @@
       tasks_from: install.yml
       public: true
 
-  - name: Add pulp host to environment
+  - name: Print Pulp endpoint
     become: no
-    delegate_to: localhost
-    ansible.builtin.copy:
-      dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/inventory/group_vars/all/pulp_server.yml"
-      content: |
-        # ansible managed
-        appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"
+    debug:
+      msg: |
+        Server configured, override 'appliances_pulp_url' with
+        appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}"
+        in your environments
diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md
new file mode 100644
index 000000000..974803030
--- /dev/null
+++ b/docs/experimental/pulp.md
@@ -0,0 +1,17 @@
+# Pulp Server
+
+In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's Ark Pulp server. The appliance will sync relevant repositories to a local Pulp server, which will be used for image builds. Using a local server can be enabled by adding `pulp` to the build groups and overriding `dnf_repos_repolist` to point at content hosted on the local server.
+
+## Deploying/configuring Pulp Server
+
+### Deploying a Pulp server
+A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. This can be run with:
+`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server=<target_host>"`
+This will print a Pulp endpoint which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note that this server's content isn't authenticated, so it is assumed that the server is deployed behind a secure network.
+
+### Using an existing Pulp server
+An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. Note that this assumes the same configuration as the appliance-deployed Pulp, i.e. no content authentication.
+
+## Syncing Pulp content with Ark
+
+By default, the appliance will sync repos for the targeted distribution during build (can be disabled by setting `appliances_sync_pulp_on_build` to `false`). You must supply your Ark credentials, either by overriding `pulp_site_upstream_password` or setting environment variable `ARK_PASSWORD`.
Content can also be synced by running `ansible/adhoc/sync-pulp.yml`, optionally setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version`. diff --git a/environments/common/inventory/group_vars/all/pulp.yml b/environments/common/inventory/group_vars/all/pulp.yml new file mode 100644 index 000000000..02b7aa816 --- /dev/null +++ b/environments/common/inventory/group_vars/all/pulp.yml @@ -0,0 +1 @@ +pulp_site_port: 8080 From c4336055ef0f641d2f210ca8e1c345e28ec7ed4d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 12 Dec 2024 16:41:24 +0000 Subject: [PATCH 063/182] testing image using site pulp --- .github/workflows/fatimage.yml | 2 +- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index d368d86cd..331035001 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -26,7 +26,7 @@ jobs: source_image_name: rocky-latest-RL8 inventory_groups: control,compute,login - image_name: openhpc-RL9 - source_image_name: rocky-latest-RL9-241212-1532-3088f837 + source_image_name: rocky-latest-RL9 inventory_groups: control,compute,login env: ANSIBLE_FORCE_COLOR: True diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 5e71beebd..5c100f999 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241206-1541-eaa36805", - "RL9": "openhpc-RL9-241206-1541-eaa36805" + "RL8": "openhpc-RL8-241212-1553-c74360bf", + "RL9": "openhpc-RL9-241212-1554-c74360bf" } } From bda3f7e7568648d03bb50ede8f11ded5e933f0cb Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 10:12:40 +0000 Subject: [PATCH 064/182] Pointed dnf repos back at ark for now + refactor --- ansible/adhoc/sync-pulp.yml | 1 - ansible/fatimage.yml | 2 +- ansible/roles/dnf_repos/defaults/main.yml | 15 ++++++------- ansible/roles/dnf_repos/tasks/set_repos.yml | 4 ++++ ansible/roles/pulp_site/defaults/main.yml | 5 +++-- docs/experimental/pulp.md | 2 +- .../inventory/group_vars/builder.yml | 21 ++++++++++++------- .../inventory/group_vars/all/defaults.yml | 8 ++++--- environments/common/inventory/groups | 2 -- 9 files changed, 35 insertions(+), 25 deletions(-) diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml index 9c7684445..f26149bba 100644 --- a/ansible/adhoc/sync-pulp.yml +++ b/ansible/adhoc/sync-pulp.yml @@ -8,4 +8,3 @@ pulp_site_target_distribution: "rocky" pulp_site_target_distribution_version: "9.4" pulp_site_target_distribution_version_major: "9" - pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}" diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index eaa5215a5..5d84fcf90 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -25,7 +25,7 @@ tasks_from: sync.yml apply: delegate_to: localhost - when: appliances_mode != 'configure' and appliances_sync_pulp_on_build + when: appliances_mode != 'configure' - hosts: dnf_repos become: yes diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index d4c80b0c9..19a5d4986 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ 
b/ansible/roles/dnf_repos/defaults/main.yml @@ -2,23 +2,24 @@ dnf_repos_pulp_url: "{{ appliances_pulp_url }}" dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" -dnf_repos_disable: true -dnf_repos_version_timestamps: "{{ appliances_repo_timestamps[ansible_distribution_version] }}" +dnf_repos_version_timestamps: "{{ appliances_repo_minor_timestamps[ansible_distribution_version] }}" +dnf_repos_username: "{{ omit }}" +dnf_repos_password: "{{ omit }}" # epel installed separately dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/baseos/{{ dnf_repos_version_timestamps.baseos }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].baseos }}" - file: rocky name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/appstream/{{ dnf_repos_version_timestamps.appstream }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].appstream }}" - file: rocky name: crb - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/crb/{{ dnf_repos_version_timestamps.crb }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].crb }}" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ dnf_repos_version_timestamps.extras }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].extras }}" -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_version }}/{{ dnf_repos_version_timestamps.epel }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ appliances_repo_major_timestamps[ansible_distribution_major_version].epel }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index dea803902..fe5e2c02c 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -6,6 +6,8 @@ name: "{{ item.name }}" baseurl: "{{ item.base_url }}" description: "{{ item.name }}" + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" loop: "{{ dnf_repos_repolist }}" - name: Install epel-release @@ -20,3 +22,5 @@ description: "{{ dnf_repos_epel_description }}" gpgcheck: false baseurl: "{{ dnf_repos_epel_baseurl }}" + username: "{{ dnf_repos_username }}" + password: "{{ dnf_repos_password }}" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 6a9e98d74..c342ea46f 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -14,7 +14,8 @@ pulp_site_target_arch: "{{ pulp_site_target_facts['architecture'] }}" pulp_site_target_distribution: "{{ pulp_site_target_facts['distribution'] | lower }}" 
pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}"
 pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}"
-pulp_site_version_timestamps: "{{ appliances_repo_timestamps[pulp_site_target_distribution_version] }}"
+pulp_site_version_timestamps: "{{ appliances_repo_minor_timestamps[pulp_site_target_distribution_version] }}"
+pulp_site_major_version_timestamps: "{{ appliances_repo_major_timestamps[pulp_site_target_distribution_version_major] }}"
 
 pulp_site_rpm_info:
 - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}"
@@ -30,7 +31,7 @@ pulp_site_rpm_info:
     url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}"
     base_path: "rocky/{{ pulp_site_target_distribution_version }}/extras/{{ pulp_site_version_timestamps.extras }}"
 - name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}"
-  url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_version_timestamps.epel }}"
+  url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_major_version_timestamps.epel }}"
   base_path: "epel/{{ pulp_site_target_distribution_version }}/{{ pulp_site_version_timestamps.epel }}"
 
 pulp_site_rpm_repo_defaults:
diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md
index 974803030..d1a40ba52 100644
--- a/docs/experimental/pulp.md
+++ b/docs/experimental/pulp.md
@@ -14,4 +14,4 @@ An existing Pulp server can be used to host Ark repos by overriding `pulp_site_p
 
 ## Syncing Pulp content with Ark
 
-By default, the appliance will sync repos for the targeted distribution during build (can be disabled by setting `appliances_sync_pulp_on_build` to `false`). You must supply your Ark credentials, either by overriding `pulp_site_upstream_password` or setting environment variable `ARK_PASSWORD`. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`, optionally setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version`.
+If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must supply your Ark credentials, either by overriding `pulp_site_upstream_password` or setting environment variable `ARK_PASSWORD`. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`, optionally setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
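As a concrete illustration of the ad-hoc sync described in the doc above, a minimal invocation might look like the following sketch. The extravar values shown are examples only (they happen to match the defaults hard-coded in `ansible/adhoc/sync-pulp.yml`), and `ARK_PASSWORD` is assumed to already be exported in the calling shell:

    ansible-playbook ansible/adhoc/sync-pulp.yml \
      -e pulp_site_target_arch=x86_64 \
      -e pulp_site_target_distribution=rocky \
      -e pulp_site_target_distribution_version=9.4 \
      -e pulp_site_target_distribution_version_major=9

Overriding these extravars is only needed when syncing content for a distribution, version or architecture other than the playbook's defaults.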
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index c4b01b03f..ce1666973 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,9 +1,14 @@ # update_enable: false # Can uncomment for speed debugging non-update related build issues -pulp_server_config: - LEAFCLOUD: - url: http://192.168.10.157:8080 - password: lookup('env','LEAFCLOUD_PULP_PASSWORD') - -appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" -pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" -appliances_sync_pulp_on_build: false + +# Uncomment below to use CI pulp servers + +# pulp_server_config: +# LEAFCLOUD: +# url: http://192.168.10.157:8080 +# password: lookup('env','LEAFCLOUD_PULP_PASSWORD') + +# appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" +# pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" + +dnf_repos_username: slurm-app-ci +dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index f2a6723ad..e1acdf19b 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -7,7 +7,7 @@ appliances_environment_name: "{{ appliances_environment_root | basename | regex_ appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform appliances_mode: configure -#appliances_pulp_url: #override required +appliances_pulp_url: https://ark.stackhpc.com # Address(ip/dns) for internal communication between services. This is # normally traffic you do no want to expose to users. 
@@ -82,11 +82,13 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ########################################################################################### -appliances_sync_pulp_on_build: true -appliances_repo_timestamps: +appliances_repo_minor_timestamps: '9.4': baseos: 20240816T002610 appstream: 20240816T002610 crb: 20240816T002610 extras: 20240816T002610 + +appliances_repo_major_timestamps: + '9': epel: 20240902T080424 diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index fbfcfa0ca..8f52477cd 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -147,9 +147,7 @@ freeipa_client [dnf_repos:children] # Hosts to replace system repos with Pulp repos -cluster builder [pulp:children] # Hosts used to run Pulp API commands -builder From 17d79241d7f76287d7208850672b1bed26ca422b Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:15:49 +0000 Subject: [PATCH 065/182] fix doca cleanup deleteing /tmp/ (#494) --- ansible/roles/doca/tasks/install.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml index 9d297e946..d26fda79e 100644 --- a/ansible/roles/doca/tasks/install.yml +++ b/ansible/roles/doca/tasks/install.yml @@ -39,13 +39,11 @@ - name: Cleanup DOCA build directories ansible.builtin.file: state: absent - path: "{{ (_doca_kernel_repo.stdout | split('/'))[:2] | join('/') }}" + path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # leading / means 1st element of split list is '' - name: Update initramfs ansible.builtin.command: - cmd: dracut -f --tmpdir /var/tmp - environment: - TMPDIR: /var/tmp + cmd: dracut -f register: _doca_dracut failed_when: _doca_dracut.stderr != '' # appears rc is always 0 From d6eabe69270a8a3c7b15d0eb2628bb2393dc35b5 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 10:18:23 +0000 Subject: [PATCH 066/182] unused var --- ansible/roles/dnf_repos/defaults/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 19a5d4986..3701305b6 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -2,7 +2,6 @@ dnf_repos_pulp_url: "{{ appliances_pulp_url }}" dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" -dnf_repos_version_timestamps: "{{ appliances_repo_minor_timestamps[ansible_distribution_version] }}" dnf_repos_username: "{{ omit }}" dnf_repos_password: "{{ omit }}" From 4a3074b9153d73f5cfc9c0c754546418f6e3b34a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 13 Dec 2024 10:27:59 +0000 Subject: [PATCH 067/182] prototype script - hostvars no-op --- .../roles/compute_init/files/compute-init.yml | 40 +++++++++++++------ ansible/roles/compute_init/tasks/main.yml | 19 ++++++++- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index f78bbe9b7..e44ec32f8 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,14 +5,15 @@ become: yes # VARS TO BE SUPPLIED VIA CLOUD INIT 
METADATA vars: - control_node_ip: "172.16.1.228" + server_node_ip: "172.16.1.154" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - nfs_export: "/exports/home" - nfs_client_mnt_options: - nfs_client_mnt_point: "/home" - nfs_client_mnt_state: mounted - nfs_server: "{{ control_node_ip }}" + nfs_configurations: + - nfs_export: "/exports/home" + nfs_client_mnt_options: + nfs_client_mnt_point: "/home" + nfs_client_mnt_state: mounted + nfs_server: "{{ server_node_ip }}" os_manila_mount_state: mounted os_manila_mount_opts: @@ -36,7 +37,7 @@ uid: 1005 basic_users_groups: [] - openhpc_conf_server: "{{ control_node_ip }}" + openhpc_conf_server: "{{ server_node_ip }}" tasks: - name: Configure resolve.conf @@ -77,7 +78,7 @@ - name: Mount /mnt/cluster mount: path: /mnt/cluster - src: "{{ vars.control_node_ip }}:/exports/cluster" + src: "{{ vars.server_node_ip }}:/exports/cluster" fstype: nfs opts: rw,sync state: mounted @@ -90,20 +91,35 @@ group: root mode: 0644 + # - name: Include hostvars from NFS share + # block: + # - name: Extract short hostname using a shell block + # shell: | + # HOSTNAME=$(hostname) + # echo "${HOSTNAME%.test.invalid}" + # register: short_hostname + + # - name: Include vars from NFS mount + # include_vars: + # file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" + - name: NFS client mount block: - name: ensure mount directory exists file: - path: "{{ nfs_client_mnt_point }}" + path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}" state: directory + loop: "{{ nfs_configurations }}" - name: mount the filesystem mount: - path: "{{ nfs_client_mnt_point }}" - src: "{{ nfs_server }}:{{ nfs_export }}" + path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}" + src: "{{ item.get('nfs_server', nfs_server) }}:{{ item.get('nfs_export', nfs_export) }}" + opts: "{{ item['nfs_client_mnt_options'] | default(nfs_client_mnt_options, true) | default(omit, true) }}" # for some reason items.get() here fails with "an incorrect mount option was specified" fstype: nfs - state: "{{ nfs_client_mnt_state }}" + state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}" + loop: "{{ nfs_configurations }}" - name: Manila mount diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index 15ba586d1..f5513a80a 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Ensure directories exist - file: + file: path: "/etc/ansible-init/{{ item }}" state: directory owner: root @@ -122,6 +122,23 @@ owner: munge group: munge mode: 0400 + + # - name: Ensure /exports/cluster/inventory_hostname directory exists + # file: + # path: /exports/cluster/{{ inventory_hostname }} + # state: directory + # owner: root + # group: root + # mode: 0755 + + # - name: Template hostvars + # template: + # src: ../templates/hostvars.j2 + # dest: "/exports/cluster/{{ inventory_hostname }}/hostvars.yml" + # owner: root + # group: root + # mode: 0644 + delegate_to: "{{ groups['control'] | first }}" - name: Inject compute initialisation playbook From 5a082e7d8584a7373f60a2de9208b0c08bdc5fd9 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:44:36 +0000 Subject: [PATCH 068/182] Fix nightly images getting timestamp/git hash (#493) --- .github/workflows/nightlybuild.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index a0e78cd0b..9cb1cea27 100644 --- 
a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -83,6 +83,7 @@ jobs: -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ -var "image_name=${{ matrix.build.image_name }}" \ + -var "image_name_version="\ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl From 91fe707db39d47436b84c1c63b07a22e2aa606e1 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:08:30 +0000 Subject: [PATCH 069/182] Update nightlybuild.yml --- .github/workflows/nightlybuild.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9cb1cea27..596b85a05 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -83,7 +83,7 @@ jobs: -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ -var "image_name=${{ matrix.build.image_name }}" \ - -var "image_name_version="\ + -var "image_name_version=" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl From e3ce4926622f3a5fabd38eb704afd2dec4048cbe Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 13 Dec 2024 10:40:31 +0000 Subject: [PATCH 070/182] use k3s_server metadata for server_ip --- ansible/roles/compute_init/files/compute-init.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index e44ec32f8..164aab8e3 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -3,9 +3,10 @@ - name: Compute node initialisation hosts: localhost become: yes - # VARS TO BE SUPPLIED VIA CLOUD INIT METADATA vars: - server_node_ip: "172.16.1.154" + os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + server_node_ip: "{{ os_metadata.meta.k3s_server }}" + resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] nfs_configurations: @@ -78,7 +79,7 @@ - name: Mount /mnt/cluster mount: path: /mnt/cluster - src: "{{ vars.server_node_ip }}:/exports/cluster" + src: "{{ server_node_ip }}:/exports/cluster" fstype: nfs opts: rw,sync state: mounted From f0e48b90a36b16ff6e5c80740ab07587d5fcd467 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 13:24:27 +0000 Subject: [PATCH 071/182] pulp sync now mirrors upstream subpaths --- ansible/roles/pulp_site/.gitignore | 1 + ansible/roles/pulp_site/defaults/main.yml | 22 ++++++++----------- .../filter_plugins/pulp-list-filters.py | 6 ++--- 3 files changed, 13 insertions(+), 16 deletions(-) create mode 100644 ansible/roles/pulp_site/.gitignore diff --git a/ansible/roles/pulp_site/.gitignore b/ansible/roles/pulp_site/.gitignore new file mode 100644 index 000000000..6738e49c1 --- /dev/null +++ b/ansible/roles/pulp_site/.gitignore @@ -0,0 +1 @@ +filter_plugins/__pycache__ \ No newline at end of file diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index c342ea46f..76ad14988 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -1,9 +1,10 @@ pulp_site_url: "{{ appliances_pulp_url }}" pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed +pulp_site_upstream_content_url: 
https://ark.stackhpc.com/pulp/content pulp_site_upstream_username: slurm-app-ci pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" -pulp_site_default_upstream_prefix: "https://ark.stackhpc.com/pulp/content/{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" +_pulp_site_rocky_prefix: "{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_validate_certs: false @@ -19,20 +20,15 @@ pulp_site_major_version_timestamps: "{{ appliances_repo_major_timestamps[pulp_si pulp_site_rpm_info: - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - url: "{{ pulp_site_default_upstream_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" - base_path: "rocky/{{ pulp_site_target_distribution_version }}/baseos/{{ pulp_site_version_timestamps.baseos }}" + subpath: "{{ _pulp_site_rocky_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - url: "{{ pulp_site_default_upstream_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" - base_path: "rocky/{{ pulp_site_target_distribution_version }}/appstream/{{ pulp_site_version_timestamps.appstream }}" + subpath: "{{ _pulp_site_rocky_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" - name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - url: "{{ pulp_site_default_upstream_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" - base_path: "rocky/{{ pulp_site_target_distribution_version }}/crb/{{ pulp_site_version_timestamps.crb }}" + subpath: "{{ _pulp_site_rocky_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" - name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - url: "{{ pulp_site_default_upstream_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" - base_path: "rocky/{{ pulp_site_target_distribution_version }}/extras/{{ pulp_site_version_timestamps.extras }}" -- name: "epel-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.epel }}" - url: "https://ark.stackhpc.com/pulp/content/epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_major_version_timestamps.epel }}" - base_path: "epel/{{ pulp_site_target_distribution_version }}/{{ pulp_site_version_timestamps.epel }}" + subpath: "{{ _pulp_site_rocky_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" +- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ pulp_site_major_version_timestamps.epel }}" + subpath: "epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_major_version_timestamps.epel }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" @@ 
-42,6 +38,6 @@ pulp_site_rpm_repo_defaults: _pulp_site_rpm_info_all: "{{ pulp_site_rpm_info | map('combine', pulp_site_rpm_repo_defaults) }}" -pulp_site_rpm_repos: "{{ _pulp_site_rpm_info_all | to_rpm_repos }}" +pulp_site_rpm_repos: "{{ _pulp_site_rpm_info_all | to_rpm_repos(pulp_site_upstream_content_url) }}" pulp_site_rpm_publications: "{{ _pulp_site_rpm_info_all | to_rpm_pubs }}" pulp_site_rpm_distributions: "{{ _pulp_site_rpm_info_all | to_rpm_distros }}" diff --git a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py index 94d89d184..50e912685 100644 --- a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py +++ b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py @@ -6,10 +6,10 @@ def filters(self): 'to_rpm_distros': self.to_rpm_distros } - def to_rpm_repos(self, list): + def to_rpm_repos(self, list, pulp_url): repo_list = map(lambda x: { 'name': x['name'], - 'url': x['url'], + 'url': pulp_url+'/'+x['subpath'], 'remote_username': x['remote_username'], 'remote_password': x['remote_password'], 'policy': x['policy'], @@ -26,6 +26,6 @@ def to_rpm_distros(self, list): distro_list = map(lambda x: { 'name': x['name'], 'repository': x['name'], - 'base_path': x['base_path'], + 'base_path': x['subpath'], 'state': x['state'] }, list) return distro_list \ No newline at end of file From 309bd0bd659a73a73aa3152cdcba56b32cc261a0 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 13:30:41 +0000 Subject: [PATCH 072/182] removed intermediate var --- ansible/roles/dnf_repos/defaults/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 3701305b6..281a57c7e 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,5 +1,4 @@ -dnf_repos_pulp_url: "{{ appliances_pulp_url }}" -dnf_repos_pulp_content_url: "{{ dnf_repos_pulp_url }}/pulp/content" +dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content" dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" dnf_repos_username: "{{ omit }}" From 9065bb6d98c45170b82e8f772254e4b5cd63aa78 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 14:13:09 +0000 Subject: [PATCH 073/182] bumped repo timestamps to latest --- .../common/inventory/group_vars/all/defaults.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e1acdf19b..e84f1e6d1 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -84,11 +84,11 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us appliances_repo_minor_timestamps: '9.4': - baseos: 20240816T002610 - appstream: 20240816T002610 - crb: 20240816T002610 - extras: 20240816T002610 + baseos: 20241115T011711 + appstream: 20241112T003151 + crb: 20241115T003133 + extras: 20241118T002802 appliances_repo_major_timestamps: '9': - epel: 20240902T080424 + epel: 20241213T010218 From 7d7bc7376fa81077e955dddc1a5a98eaf8956c62 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:35:56 +0000 Subject: [PATCH 074/182] bump images --- 
.../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 5c100f999..8659f3e90 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241212-1553-c74360bf", - "RL9": "openhpc-RL9-241212-1554-c74360bf" + "RL8": "openhpc-RL8-241213-1402-a2a705c9", + "RL9": "openhpc-RL9-241213-1402-a2a705c9" } } From cc81aeff7f57b265b8dcf51beb8316ae25eeeb3d Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:48:21 +0000 Subject: [PATCH 075/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 5c100f999..125180527 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241212-1553-c74360bf", - "RL9": "openhpc-RL9-241212-1554-c74360bf" + "RL8": "openhpc-RL8-241213-1416-9065bb6d", + "RL9": "openhpc-RL9-241213-1417-9065bb6d" } } From f343f67395a143493f2b48c5638c2b7a2e4101b3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 13 Dec 2024 15:02:01 +0000 Subject: [PATCH 076/182] moved to later in build/site and moved groups --- ansible/bootstrap.yml | 9 --------- ansible/fatimage.yml | 2 ++ ansible/packages.yml | 10 ++++++++++ ansible/site.yml | 1 + environments/common/inventory/groups | 4 +--- environments/common/layouts/everything | 4 ++++ 6 files changed, 18 insertions(+), 12 deletions(-) create mode 100644 ansible/packages.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 432a2a319..733d4b3f8 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -216,15 +216,6 @@ msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" when: "update_enable | default('false') | bool" -- hosts: extra_packages - become: yes - tags: - - extra_packages - tasks: - dnf: - - name: "{{ appliances_extra_packages }}" - when: appliances_mode != 'configure' or appliances_packages_during_configure - - hosts: - selinux - update diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b28e4f308..c40aca6fd 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -199,6 +199,8 @@ name: cloudalchemy.grafana tasks_from: install.yml +- ansible.builtin.import_playbook: packages.yml + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/packages.yml b/ansible/packages.yml new file mode 100644 index 000000000..e447dcda7 --- /dev/null +++ b/ansible/packages.yml @@ -0,0 +1,10 @@ + +- hosts: extra_packages + become: yes + tags: + - extra_packages + tasks: + - name: Install additional packages + dnf: + name: "{{ appliances_extra_packages }}" + when: appliances_mode != 'configure' or appliances_packages_during_configure diff --git a/ansible/site.yml b/ansible/site.yml index bb379399d..878b15a35 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,6 +27,7 @@ - import_playbook: slurm.yml - import_playbook: 
portal.yml - import_playbook: monitoring.yml +- import_playbook: packages.yml - name: Run post.yml hook vars: diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index d8ad503fe..2a6244962 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -145,7 +145,5 @@ freeipa_client [lustre] # Hosts to run lustre client -[extra_packages:children] +[extra_packages] # Hosts to install specified additional packages on -cluster -builder diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index ba5cbc08d..6f6f63590 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -92,3 +92,7 @@ control [lustre] # Hosts to run lustre client + +[extra_packages:children] +# Hosts to install specified additional packages on +cluster From 07ed8223147d4cfb40fbf557766920fea02b5260 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 13 Dec 2024 16:11:44 +0000 Subject: [PATCH 077/182] compute init node condition based off metadata --- .../roles/compute_init/files/compute-init.yml | 19 +++++++++++++++++-- environments/common/layouts/everything | 2 +- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 164aab8e3..3e9c6e470 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,6 +5,7 @@ become: yes vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + iam_slurm_compute: "{{ os_metadata.meta.slurm_compute | default(false) }}" server_node_ip: "{{ os_metadata.meta.k3s_server }}" resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] @@ -41,6 +42,11 @@ openhpc_conf_server: "{{ server_node_ip }}" tasks: + - name: Skip initialization if slurm_compute metadata set to false + debug: + msg: "Skipping compute initialization" + when: not iam_slurm_compute | bool + - name: Configure resolve.conf block: - name: Set nameservers in /etc/resolv.conf @@ -65,7 +71,9 @@ name: NetworkManager state: reloaded when: _copy_nm_config.changed | default(false) - when: resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0 + when: + - resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0 + - iam_slurm_compute | bool - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts @@ -91,6 +99,7 @@ owner: root group: root mode: 0644 + when: iam_slurm_compute | bool # - name: Include hostvars from NFS share # block: @@ -103,6 +112,7 @@ # - name: Include vars from NFS mount # include_vars: # file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" + # when: iam_slurm_compute | bool - name: NFS client mount @@ -121,6 +131,7 @@ fstype: nfs state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}" loop: "{{ nfs_configurations }}" + when: iam_slurm_compute | bool - name: Manila mount @@ -205,6 +216,7 @@ loop_control: label: "{{ item.share_name }}" when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] + when: iam_slurm_compute | bool - name: Basic users setup @@ -229,6 +241,7 @@ loop_control: label: "{{ item.name }}" when: "'sudo' in item" + when: iam_slurm_compute | bool - name: Configure EESSI @@ -245,6 +258,7 @@ - name: Ensure CVMFS config is setup command: cmd: "cvmfs_config setup" + when: iam_slurm_compute | bool - name: Configure openhpc @@ -285,4 +299,5 @@ 
service: name: slurmd enabled: true - state: started \ No newline at end of file + state: started + when: iam_slurm_compute | bool diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 2d55c18cf..5ada017e1 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -84,7 +84,7 @@ cluster [compute_init:children] # Hosts to deploy compute initialisation ansible-init script to. -compute +cluster [k3s:children] # Hosts to run k3s server/agent From a43a5f97ee62a2ef0283dee3db3df10e34429333 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 13 Dec 2024 17:11:17 +0000 Subject: [PATCH 078/182] fail gracefully when NFS server not up --- .../roles/compute_init/files/compute-init.yml | 53 +++++++++++++------ 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 3e9c6e470..9b098fc14 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -91,6 +91,13 @@ fstype: nfs opts: rw,sync state: mounted + register: nfs_mount_result + ignore_errors: true + + - name: Fail gracefully if NFS mount is not available + debug: + msg: "NFS mount failed. Skipping compute initialization. Re-image if this persists." + when: nfs_mount_result.failed - name: Copy /mnt/cluster/hosts contents to /etc/hosts copy: @@ -99,20 +106,24 @@ owner: root group: root mode: 0644 + when: not nfs_mount_result.failed when: iam_slurm_compute | bool - # - name: Include hostvars from NFS share - # block: - # - name: Extract short hostname using a shell block - # shell: | - # HOSTNAME=$(hostname) - # echo "${HOSTNAME%.test.invalid}" - # register: short_hostname - # - name: Include vars from NFS mount - # include_vars: - # file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" - # when: iam_slurm_compute | bool + - name: Include hostvars from NFS share + block: + - name: Extract short hostname using a shell block + shell: | + HOSTNAME=$(hostname) + echo "${HOSTNAME%.test.invalid}" + register: short_hostname + + - name: Include vars from NFS mount + include_vars: + file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" + when: + - iam_slurm_compute | bool + - not nfs_mount_result.failed - name: NFS client mount @@ -131,7 +142,9 @@ fstype: nfs state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}" loop: "{{ nfs_configurations }}" - when: iam_slurm_compute | bool + when: + - iam_slurm_compute | bool + - not nfs_mount_result.failed - name: Manila mount @@ -216,7 +229,9 @@ loop_control: label: "{{ item.share_name }}" when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] - when: iam_slurm_compute | bool + when: + - iam_slurm_compute | bool + - not nfs_mount_result.failed - name: Basic users setup @@ -241,7 +256,9 @@ loop_control: label: "{{ item.name }}" when: "'sudo' in item" - when: iam_slurm_compute | bool + when: + - iam_slurm_compute | bool + - not nfs_mount_result.failed - name: Configure EESSI @@ -258,7 +275,9 @@ - name: Ensure CVMFS config is setup command: cmd: "cvmfs_config setup" - when: iam_slurm_compute | bool + when: + - iam_slurm_compute | bool + - not nfs_mount_result.failed - name: Configure openhpc @@ -300,4 +319,6 @@ name: slurmd enabled: true state: started - when: iam_slurm_compute | bool + when: + - iam_slurm_compute | bool + - not nfs_mount_result.failed \ No newline at end of file From 
76f292e2da4d578a92434da90c98783b69aa7398 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 13 Dec 2024 17:20:34 +0000 Subject: [PATCH 079/182] rejoin node to cluster --- ansible/roles/compute_init/files/compute-init.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 9b098fc14..165700668 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -118,9 +118,9 @@ echo "${HOSTNAME%.test.invalid}" register: short_hostname - - name: Include vars from NFS mount - include_vars: - file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" + # - name: Include vars from NFS mount + # include_vars: + # file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" when: - iam_slurm_compute | bool - not nfs_mount_result.failed @@ -319,6 +319,9 @@ name: slurmd enabled: true state: started + + - name: Ensure node is in cluster + command: scontrol update state=resume nodename={{ short_hostname.stdout }} when: - iam_slurm_compute | bool - not nfs_mount_result.failed \ No newline at end of file From 1a400db7a5a73eb73c8707e384ceafa7b6f5f544 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 13 Dec 2024 20:38:19 +0000 Subject: [PATCH 080/182] ok: Skipping compute initialization as metadata compute_groups is empty --- ansible/extras.yml | 17 +- .../roles/compute_init/files/compute-init.yml | 385 ++++-------------- ansible/roles/compute_init/tasks/export.yml | 35 ++ ansible/roles/compute_init/tasks/install.yml | 73 ++++ ansible/roles/compute_init/tasks/main.yml | 8 - .../compute_init/templates/hostvars.yml.j2 | 1 + environments/common/layouts/everything | 5 +- 7 files changed, 197 insertions(+), 327 deletions(-) create mode 100644 ansible/roles/compute_init/tasks/export.yml create mode 100644 ansible/roles/compute_init/tasks/install.yml create mode 100644 ansible/roles/compute_init/templates/hostvars.yml.j2 diff --git a/ansible/extras.yml b/ansible/extras.yml index 4cbe931b1..85e068e89 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -37,13 +37,26 @@ - import_role: name: persist_hostkeys -- name: Inject ansible-init compute script +# TODO: I'm not convinced this is the right place +- hosts: compute_init:!builder + tags: compute_init + become: yes + name: Export hostvars + tasks: + - include_role: + name: compute_init + tasks_from: export.yml + +# TODO: really this should only run during build +# but handy not to for debugging +- name: Install compute_init script hosts: compute_init tags: compute_init become: yes tasks: - - import_role: + - include_role: name: compute_init + tasks_from: install.yml - name: Install k9s become: yes diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 165700668..53071cc48 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,323 +5,78 @@ become: yes vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" - iam_slurm_compute: "{{ os_metadata.meta.slurm_compute | default(false) }}" server_node_ip: "{{ os_metadata.meta.k3s_server }}" - - resolv_conf_nameservers: [1.1.1.1, 8.8.8.8] - - nfs_configurations: - - nfs_export: "/exports/home" - nfs_client_mnt_options: - nfs_client_mnt_point: "/home" - nfs_client_mnt_state: mounted - nfs_server: "{{ server_node_ip }}" - - 
os_manila_mount_state: mounted - os_manila_mount_opts: - - x-systemd.device-timeout=30 - - x-systemd.mount-timeout=30 - - noatime - - _netdev # prevents mount blocking early boot before networking available - - rw - os_manila_mount_ceph_conf_path: /etc/ceph - - basic_users_manage_homedir: false - basic_users_userdefaults: - state: present - create_home: "{{ basic_users_manage_homedir }}" - generate_ssh_key: "{{ basic_users_manage_homedir }}" - ssh_key_comment: "{{ item.name }}" - test_user_password: "zXpcWyGQL7jtZnqylQra4g==" - basic_users_users: - - name: testuser # can't use rocky as $HOME isn't shared! - password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent - uid: 1005 - basic_users_groups: [] - - openhpc_conf_server: "{{ server_node_ip }}" + compute_groups: "{{ os_metadata.meta.compute_groups | default([]) }}" + + # TODO: "role defaults" + resolv_conf_nameservers: [] + + # nfs_configurations: + # - nfs_export: "/exports/home" + # nfs_client_mnt_options: + # nfs_client_mnt_point: "/home" + # nfs_client_mnt_state: mounted + # nfs_server: "{{ server_node_ip }}" + + # os_manila_mount_state: mounted + # os_manila_mount_opts: + # - x-systemd.device-timeout=30 + # - x-systemd.mount-timeout=30 + # - noatime + # - _netdev # prevents mount blocking early boot before networking available + # - rw + # os_manila_mount_ceph_conf_path: /etc/ceph + + # basic_users_manage_homedir: false + # basic_users_userdefaults: + # state: present + # create_home: "{{ basic_users_manage_homedir }}" + # generate_ssh_key: "{{ basic_users_manage_homedir }}" + # ssh_key_comment: "{{ item.name }}" + # test_user_password: "zXpcWyGQL7jtZnqylQra4g==" + # basic_users_users: + # - name: testuser # can't use rocky as $HOME isn't shared! 
+ # password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent + # uid: 1005 + # basic_users_groups: [] + + # openhpc_conf_server: "{{ server_node_ip }}" tasks: - - name: Skip initialization if slurm_compute metadata set to false - debug: - msg: "Skipping compute initialization" - when: not iam_slurm_compute | bool - - - name: Configure resolve.conf - block: - - name: Set nameservers in /etc/resolv.conf - ansible.builtin.template: - src: /etc/ansible-init/templates/resolv.conf.j2 - dest: /etc/resolv.conf - owner: root - group: root - mode: u=rw,og=r - - - name: Disable NetworkManager control of resolv.conf - ansible.builtin.copy: - src: /etc/ansible-init/files/NetworkManager-dns-none.conf - dest: /etc/NetworkManager/conf.d/90-dns-none.conf - owner: root - group: root - mode: u=rw,og=r - register: _copy_nm_config - - - name: Reload NetworkManager - ansible.builtin.systemd: - name: NetworkManager - state: reloaded - when: _copy_nm_config.changed | default(false) - when: - - resolv_conf_nameservers is defined and resolv_conf_nameservers | length > 0 - - iam_slurm_compute | bool - - - - name: Mount /mnt/cluster on compute nodes and copy hosts to /etc/hosts - block: - - name: Ensure the mount directory exists - file: - path: /mnt/cluster - state: directory - mode: 0755 + - block: + - name: Report skipping initialization if not compute node + # meta: end_play produces no output + debug: + msg: "Skipping compute initialization as metadata compute_groups is empty" - - name: Mount /mnt/cluster - mount: - path: /mnt/cluster - src: "{{ server_node_ip }}:/exports/cluster" - fstype: nfs - opts: rw,sync - state: mounted - register: nfs_mount_result - ignore_errors: true - - - name: Fail gracefully if NFS mount is not available + - meta: end_play + when: compute_groups | length == 0 + + - name: Ensure the mount directory exists + file: + path: /mnt/cluster + state: directory + owner: root + group: root + mode: u=rwX,go= # is sensitive + + - name: Mount /mnt/cluster + mount: + path: /mnt/cluster + src: "{{ server_node_ip }}:/exports/cluster" + fstype: nfs + opts: ro,sync + state: mounted + register: nfs_mount_result + ignore_errors: true + register: _mount_mnt_cluster + # TODO: add some retries here? + + - block: + - name: Report skipping initialization if cannot mount nfs + # meta: end_play produces no output debug: - msg: "NFS mount failed. Skipping compute initialization. Re-image if this persists." 
- when: nfs_mount_result.failed - - - name: Copy /mnt/cluster/hosts contents to /etc/hosts - copy: - src: /mnt/cluster/hosts - dest: /etc/hosts - owner: root - group: root - mode: 0644 - when: not nfs_mount_result.failed - when: iam_slurm_compute | bool - - - - name: Include hostvars from NFS share - block: - - name: Extract short hostname using a shell block - shell: | - HOSTNAME=$(hostname) - echo "${HOSTNAME%.test.invalid}" - register: short_hostname - - # - name: Include vars from NFS mount - # include_vars: - # file: "/mnt/cluster/{{ short_hostname.stdout }}/hostvars.yml" - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: NFS client mount - block: - - name: ensure mount directory exists - file: - path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}" - state: directory - loop: "{{ nfs_configurations }}" - - - name: mount the filesystem - mount: - path: "{{ item.get('nfs_client_mnt_point', nfs_client_mnt_point) }}" - src: "{{ item.get('nfs_server', nfs_server) }}:{{ item.get('nfs_export', nfs_export) }}" - opts: "{{ item['nfs_client_mnt_options'] | default(nfs_client_mnt_options, true) | default(omit, true) }}" # for some reason items.get() here fails with "an incorrect mount option was specified" - fstype: nfs - state: "{{ item.get('nfs_client_mnt_state', nfs_client_mnt_state) }}" - loop: "{{ nfs_configurations }}" - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: Manila mount - block: - - name: Read manila share info from nfs file - slurp: - src: "/mnt/cluster/manila_share_info.yml" - register: manila_share_info_file - no_log: true - - - name: Parse and set fact for manila share info - set_fact: - os_manila_mount_share_info: "{{ manila_share_info_file.content | b64decode | from_yaml }}" - - - name: Read manila shares from nfs file - slurp: - src: "/mnt/cluster/manila_shares.yml" - register: manila_shares_file - - - name: Parse and set fact for manila shares - set_fact: - os_manila_mount_shares: "{{ manila_shares_file.content | b64decode | from_yaml }}" - - - name: Ensure Ceph configuration directory exists - ansible.builtin.file: - path: "{{ os_manila_mount_ceph_conf_path }}" - state: directory - mode: "0755" - owner: root - group: root - - - name: Configure ceph.conf using os_manila_mount_host - ansible.builtin.template: - src: /etc/ansible-init/templates/ceph.conf.j2 - dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf" - owner: root - group: root - mode: "0600" - - - name: Ensure mount directory exists - ansible.builtin.file: - path: "{{ item.mount_path }}" - state: directory - owner: "{{ item.mount_user | default(omit) }}" - group: "{{ item.mount_group | default(omit) }}" - mode: "{{ item.mount_mode | default(omit) }}" - loop: "{{ os_manila_mount_shares }}" - loop_control: - label: "{{ item.share_name }}" - - - name: Write Ceph client keyring - ansible.builtin.template: - src: /etc/ansible-init/templates/ceph.keyring.j2 - dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring" - mode: "0600" - owner: root - group: root - loop: "{{ os_manila_mount_share_info }}" - loop_control: - label: "{{ item.share_name }}" - - - name: Mount the Ceph share - ansible.posix.mount: - path: "{{ item[0].mount_path }}" - src: "{{ item[1].host }}:{{ item[1].export }}" - fstype: ceph - opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}" - # NB share_user is looked up here in case of autodetection - state: "{{ item[0].mount_state | 
default(os_manila_mount_state) }}" - loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}" - loop_control: - label: "{{ item[0].share_name }}" - - - name: Ensure mounted directory has correct permissions - ansible.builtin.file: - path: "{{ item.mount_path }}" - state: directory - owner: "{{ item.mount_user | default(omit) }}" - group: "{{ item.mount_group | default(omit) }}" - mode: "{{ item.mount_mode | default(omit) }}" - loop: "{{ os_manila_mount_shares }}" - loop_control: - label: "{{ item.share_name }}" - when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: Basic users setup - block: - - name: Create groups - ansible.builtin.group: "{{ item }}" - loop: "{{ basic_users_groups }}" - - - name: Create users - user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}" - loop: "{{ basic_users_users }}" - loop_control: - label: "{{ item.name }} [{{ item.state | default('present') }}]" - register: basic_users_info - - - name: Write sudo rules - blockinfile: - path: /etc/sudoers.d/80-{{ item.name}}-user - block: "{{ item.sudo }}" - create: true - loop: "{{ basic_users_users }}" - loop_control: - label: "{{ item.name }}" - when: "'sudo' in item" - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: Configure EESSI - block: - - name: Copy /mnt/cluster/cvmfs/default.local contents to /etc/cvmfs/default.local - copy: - src: /mnt/cluster/cvmfs/default.local - dest: /etc/cvmfs/default.local - owner: root - group: root - mode: 0644 - - # NOTE: Not clear how to make this idempotent - - name: Ensure CVMFS config is setup - command: - cmd: "cvmfs_config setup" - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed - - - - name: Configure openhpc - block: - - name: Fix permissions on /etc to pass Munge startup checks - # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root) - # which fails munged startup checks - file: - path: /etc - state: directory - mode: g-w - - - name: Copy Munge key from NFS-mounted directory to /etc/munge - copy: - src: "/mnt/cluster/openhpc_munge.key" - dest: "/etc/munge/munge.key" - owner: munge - group: munge - mode: 0400 - - - name: Set slurmctld location for configless operation - lineinfile: - path: /etc/sysconfig/slurmd - line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'" - regexp: "^SLURMD_OPTIONS=" - create: yes - owner: root - group: root - mode: 0644 - - - name: Configure Munge service - service: - name: munge - enabled: true - state: started - - - name: Ensure slurmd state - service: - name: slurmd - enabled: true - state: started - - - name: Ensure node is in cluster - command: scontrol update state=resume nodename={{ short_hostname.stdout }} - when: - - iam_slurm_compute | bool - - not nfs_mount_result.failed \ No newline at end of file + msg: "Skipping compute initialization as cannot mount exports/cluster share" + + - meta: end_play + when: _mount_mnt_cluster.failed diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml new file mode 100644 index 000000000..3e9340cb5 --- /dev/null +++ b/ansible/roles/compute_init/tasks/export.yml @@ -0,0 +1,35 @@ +- name: Ensure the /exports/cluster directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: u=rwX,go= + run_once: true + delegate_to: "{{ groups['control'] | first }}" 
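+# NB: the remaining tasks below are likewise delegated to the first control
+# node, which is the NFS server for /exports/cluster (see nfs.yml)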
+ +- name: Copy /etc/hosts to /exports/cluster + copy: + src: /etc/hosts + dest: /exports/cluster/hosts + owner: root + group: root + mode: u=rw,go= + remote_src: true + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Create hostvars directory + file: + path: /exports/cluster/hostvars/{{ inventory_hostname }}/ + state: directory + mode: u=rwX,go= + # TODO: owner,mode,etc + delegate_to: "{{ groups['control'] | first }}" + +- name: Template out hostvars + template: + src: hostvars.yml.j2 + dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml + mode: u=rw,go= + delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml new file mode 100644 index 000000000..c48ec612d --- /dev/null +++ b/ansible/roles/compute_init/tasks/install.yml @@ -0,0 +1,73 @@ +--- + +- name: Ensure directories exist + file: + path: "/etc/ansible-init/{{ item }}" + state: directory + owner: root + group: root + mode: 0755 + loop: + - templates + - files + - library + - filter_plugins + +- name: Inject templates + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/templates/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../resolv_conf/templates/resolv.conf.j2 + - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 + - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 + +- name: Inject files + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/files/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../resolv_conf/files/NetworkManager-dns-none.conf + +- name: Inject libraries + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/library/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../stackhpc.os-manila-mount/library/os_manila_share.py + +- name: Inject filter_plugins + copy: + src: '{{ item }}' + dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}' + owner: root + group: root + mode: 0644 + loop: + - ../../basic_users/filter_plugins/filter_keys.py + +- name: Add filter_plugins ansible.cfg + lineinfile: + path: /etc/ansible-init/ansible.cfg + line: "filter_plugins = /etc/ansible-init/filter_plugins" + state: present + owner: root + group: root + mode: 0644 + +- name: Inject compute initialisation playbook + copy: + src: compute-init.yml + dest: /etc/ansible-init/playbooks/1-compute-init.yml + owner: root + group: root + mode: 0644 \ No newline at end of file diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml index f5513a80a..cb4c57d35 100644 --- a/ansible/roles/compute_init/tasks/main.yml +++ b/ansible/roles/compute_init/tasks/main.yml @@ -140,11 +140,3 @@ # mode: 0644 delegate_to: "{{ groups['control'] | first }}" - -- name: Inject compute initialisation playbook - copy: - src: compute-init.yml - dest: /etc/ansible-init/playbooks/compute-init.yml - owner: root - group: root - mode: 0644 \ No newline at end of file diff --git a/ansible/roles/compute_init/templates/hostvars.yml.j2 b/ansible/roles/compute_init/templates/hostvars.yml.j2 new file mode 100644 index 000000000..7d4351b44 --- /dev/null +++ b/ansible/roles/compute_init/templates/hostvars.yml.j2 @@ -0,0 +1 @@ +{{ hostvars[inventory_hostname] | to_nice_json }} \ No newline at end of file diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 5ada017e1..0fc447cf5 100644 --- a/environments/common/layouts/everything +++ 
b/environments/common/layouts/everything @@ -83,8 +83,9 @@ openhpc cluster [compute_init:children] -# Hosts to deploy compute initialisation ansible-init script to. -cluster +# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on +# TODO: actually should be empty for now +compute [k3s:children] # Hosts to run k3s server/agent From c9ebd482da31417a8db47a4101d2964fdebf75bf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 13 Dec 2024 20:53:21 +0000 Subject: [PATCH 081/182] compute-init stage 1 working --- ansible/extras.yml | 2 +- docs/experimental/compute-init.md | 100 ++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 docs/experimental/compute-init.md diff --git a/ansible/extras.yml b/ansible/extras.yml index 85e068e89..e615b1605 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -48,7 +48,7 @@ tasks_from: export.yml # TODO: really this should only run during build -# but handy not to for debugging +# but handy not to for debugging without build - name: Install compute_init script hosts: compute_init tags: compute_init diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md new file mode 100644 index 000000000..1e3f07c1f --- /dev/null +++ b/docs/experimental/compute-init.md @@ -0,0 +1,100 @@ + +To develop/debug this without actually having to build an image: + +On deploy host: + + .stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/extras.yml --tags compute_init + +On compute node: + + [root@rl9-compute-0 rocky]# rm /var/lib/ansible-init.done + [root@rl9-compute-0 rocky]# systemctl restart ansible-init + [root@rl9-compute-0 rocky]# systemctl status ansible-init + + +Without any metadata: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago + Main PID: 16089 (ansible-init) + Tasks: 8 (limit: 10912) + Memory: 99.5M + CPU: 11.687s + CGroup: /system.slice/ansible-init.service + ├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init + ├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml + ├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml + ├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0" + ├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py + ├─16363 /usr/bin/mount /mnt/cluster + └─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync + + Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] + Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] ********************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure 
the mount directory exists] *************************************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access> + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ****************************************************** + Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None + [root@rl9-compute-0 rocky]# systemctl status ansible-init + +Added metadata via horizon: + + compute_groups ["compute"] + + +OK: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago + Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 16089 (code=exited, status=0/SUCCESS) + CPU: 13.003s + + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => { + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share" + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: } + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP ********************************************************************* + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1 + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully + Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
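+
+Aside: metadata was added via Horizon above, but the OpenStack CLI should work
+too - an untested sketch, reusing the node and metadata key shown here:
+
+    openstack server set --property compute_groups='["compute"]' rl9-compute-0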
+ +Now run site.yml, then restart ansible-init again: + + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago + Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 18921 (code=exited, status=0/SUCCESS) + CPU: 8.240s + + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] ********************** + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ******************************************************************** + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP ********************************************************************* + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully + Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. + [root@rl9-compute-0 rocky]# ls /mnt/cluster/host + hosts hostvars/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- + rl9-compute-0/ rl9-compute-1/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- + rl9-compute-0/ rl9-compute-1/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/ + hostvars.yml \ No newline at end of file From 3a583a95dfe52ecea141be9b47ae9630ed26c5da Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 13 Dec 2024 21:08:34 +0000 Subject: [PATCH 082/182] load hostvars --- .../roles/compute_init/files/compute-init.yml | 45 +++++-------------- docs/experimental/compute-init.md | 27 +++++++++-- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 53071cc48..5661b467d 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -8,40 +8,10 @@ server_node_ip: "{{ os_metadata.meta.k3s_server }}" compute_groups: "{{ os_metadata.meta.compute_groups | default([]) }}" - # TODO: "role defaults" + # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects + # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty resolv_conf_nameservers: [] - - # nfs_configurations: - # - nfs_export: "/exports/home" - # nfs_client_mnt_options: - # nfs_client_mnt_point: "/home" - # nfs_client_mnt_state: mounted - # nfs_server: "{{ server_node_ip }}" - - # os_manila_mount_state: mounted - # os_manila_mount_opts: - # - x-systemd.device-timeout=30 - # - x-systemd.mount-timeout=30 - # - noatime - # - _netdev # prevents mount blocking early boot before networking available - # - rw - # os_manila_mount_ceph_conf_path: /etc/ceph - - # 
basic_users_manage_homedir: false - # basic_users_userdefaults: - # state: present - # create_home: "{{ basic_users_manage_homedir }}" - # generate_ssh_key: "{{ basic_users_manage_homedir }}" - # ssh_key_comment: "{{ item.name }}" - # test_user_password: "zXpcWyGQL7jtZnqylQra4g==" - # basic_users_users: - # - name: testuser # can't use rocky as $HOME isn't shared! - # password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent - # uid: 1005 - # basic_users_groups: [] - - # openhpc_conf_server: "{{ server_node_ip }}" - + tasks: - block: - name: Report skipping initialization if not compute node @@ -80,3 +50,12 @@ - meta: end_play when: _mount_mnt_cluster.failed + + - name: Load hostvars from NFS + # this is higher priority than vars block = normal ansible's hostvars + include_vars: + file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname + + - name: Demonstrate hostvars have loaded + debug: + var: prometheus_version diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index 1e3f07c1f..efc0cdcd9 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -7,8 +7,7 @@ On deploy host: On compute node: - [root@rl9-compute-0 rocky]# rm /var/lib/ansible-init.done - [root@rl9-compute-0 rocky]# systemctl restart ansible-init + [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init [root@rl9-compute-0 rocky]# systemctl status ansible-init @@ -97,4 +96,26 @@ Now run site.yml, then restart ansible-init again: [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- rl9-compute-0/ rl9-compute-1/ [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/ - hostvars.yml \ No newline at end of file + hostvars.yml + +This commit - shows that hostvars have loaded: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago + Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 27585 (code=exited, status=0/SUCCESS) + CPU: 8.161s + + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] **************************************** + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => { + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0" + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: } + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP ********************************************************************* + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully + Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
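+
+Note the exported hostvars file is JSON (it is templated with `to_nice_json`)
+despite the `.yml` extension, so other values can be checked directly on the
+node - sketch:
+
+    [root@rl9-compute-0 rocky]# python3 -c "import json; print(json.load(open('/mnt/cluster/hostvars/rl9-compute-0/hostvars.yml'))['prometheus_version'])"
+    2.27.0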
+ From 8bb90b4b61efa94196976e6ad9b69ddbd8c1f4fc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 13 Dec 2024 21:15:24 +0000 Subject: [PATCH 083/182] simplify compute-init file copy --- ansible/roles/compute_init/tasks/install.yml | 55 ++++++-------------- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index c48ec612d..4eef5deb8 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -13,49 +13,28 @@ - library - filter_plugins -- name: Inject templates +- name: Inject files from roles copy: - src: '{{ item }}' - dest: '/etc/ansible-init/templates/{{ item | basename }}' + src: '{{ item.src }}' + dest: '/etc/ansible-init/{{ item.dest }}' owner: root group: root mode: 0644 loop: - - ../../resolv_conf/templates/resolv.conf.j2 - - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 - - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - -- name: Inject files - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/files/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../resolv_conf/files/NetworkManager-dns-none.conf - -- name: Inject libraries - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/library/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../stackhpc.os-manila-mount/library/os_manila_share.py - -- name: Inject filter_plugins - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../basic_users/filter_plugins/filter_keys.py + - src: ../../resolv_conf/templates/resolv.conf.j2 + dest: templates/resolv.conf.j2 + - src: ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 + dest: templates/ceph.conf.j2 + - src: ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 + dest: templates/ceph.keyring.j2 + - src: ../../resolv_conf/files/NetworkManager-dns-none.conf + dest: files/NetworkManager-dns-none.conf + - src: ../../stackhpc.os-manila-mount/library/os_manila_share.py + dest: library/os_manila_share.py + - src: ../../basic_users/filter_plugins/filter_keys.py + dest: filter_plugins/filter_keys.py -- name: Add filter_plugins ansible.cfg +- name: Add filter_plugins to ansible.cfg lineinfile: path: /etc/ansible-init/ansible.cfg line: "filter_plugins = /etc/ansible-init/filter_plugins" @@ -64,7 +43,7 @@ group: root mode: 0644 -- name: Inject compute initialisation playbook +- name: Add compute initialisation playbook copy: src: compute-init.yml dest: /etc/ansible-init/playbooks/1-compute-init.yml From 7babc210ce686a160e6461f8ecf4bd67dd963b4f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Sat, 14 Dec 2024 21:38:11 +0000 Subject: [PATCH 084/182] move compute_init tasks to right place and document --- ansible/extras.yml | 13 +- ansible/fatimage.yml | 10 ++ ansible/roles/compute_init/tasks/main.yml | 142 ------------------ docs/experimental/compute-init.md | 42 +++++- .../common/inventory/group_vars/all/nfs.yml | 2 +- 5 files changed, 49 insertions(+), 160 deletions(-) delete mode 100644 ansible/roles/compute_init/tasks/main.yml diff --git a/ansible/extras.yml b/ansible/extras.yml index e615b1605..10ca4cc9d 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -37,7 +37,7 @@ - import_role: name: persist_hostkeys -# TODO: I'm not convinced this is the right place +# TODO: Is this is the right place? 
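+# NB: only the compute_init export tasks run from here; the matching install
+# tasks now run from fatimage.yml, i.e. at image build time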
- hosts: compute_init:!builder tags: compute_init become: yes @@ -47,17 +47,6 @@ name: compute_init tasks_from: export.yml -# TODO: really this should only run during build -# but handy not to for debugging without build -- name: Install compute_init script - hosts: compute_init - tags: compute_init - become: yes - tasks: - - include_role: - name: compute_init - tasks_from: install.yml - - name: Install k9s become: yes hosts: k9s diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 439c50e70..6063196c3 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -55,6 +55,16 @@ - import_playbook: extras.yml +# TODO: is this the right place? +- name: Install compute_init script + hosts: compute_init + tags: compute_init # tagged to allow running on cluster instances for dev + become: yes + tasks: + - include_role: + name: compute_init + tasks_from: install.yml + - hosts: builder become: yes gather_facts: yes diff --git a/ansible/roles/compute_init/tasks/main.yml b/ansible/roles/compute_init/tasks/main.yml deleted file mode 100644 index cb4c57d35..000000000 --- a/ansible/roles/compute_init/tasks/main.yml +++ /dev/null @@ -1,142 +0,0 @@ ---- - -- name: Ensure directories exist - file: - path: "/etc/ansible-init/{{ item }}" - state: directory - owner: root - group: root - mode: 0755 - loop: - - templates - - files - - library - - filter_plugins - -- name: Inject templates - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/templates/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../resolv_conf/templates/resolv.conf.j2 - - ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 - - ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 - -- name: Inject files - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/files/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../resolv_conf/files/NetworkManager-dns-none.conf - -- name: Inject libraries - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/library/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../basic_users/library/terminate_user_sessions.py - - ../../stackhpc.os-manila-mount/library/os_manila_share.py - - ../../stackhpc.openhpc/library/sacct_cluster.py - -- name: Inject filter_plugins - copy: - src: '{{ item }}' - dest: '/etc/ansible-init/filter_plugins/{{ item | basename }}' - owner: root - group: root - mode: 0644 - loop: - - ../../basic_users/filter_plugins/filter_keys.py - - ../../stackhpc.openhpc/filter_plugins/slurm_conf.py - -- name: Add filter_plugins ansible.cfg - lineinfile: - path: /etc/ansible-init/ansible.cfg - line: "filter_plugins = /etc/ansible-init/filter_plugins" - state: present - owner: root - group: root - mode: 0644 - -- name: Ensure nfs /exports/cluster configured - block: - - name: Ensure the /exports/cluster directory exists - file: - path: /exports/cluster - state: directory - owner: root - group: root - mode: 0755 - - - name: Copy /etc/hosts to /exports/cluster - copy: - src: /etc/hosts - dest: /exports/cluster/hosts - owner: root - group: root - mode: 0644 - remote_src: true - - - name: Copy manila share info to /exports/cluster - copy: - content: "{{ os_manila_mount_share_info | to_nice_yaml }}" - dest: "/exports/cluster/manila_share_info.yml" - when: os_manila_mount_share_info is defined - - - name: Copy manila mount shares to /exports/cluster - copy: - content: "{{ os_manila_mount_shares | to_nice_yaml }}" - dest: "/exports/cluster/manila_shares.yml" - when: os_manila_mount_shares is defined - - 
- name: Ensure /exports/cluster/cvmfs directory exists - file: - path: /exports/cluster/cvmfs - state: directory - owner: root - group: root - mode: 0755 - - - name: Copy EESSI CVMFS config to /exports/cluster - copy: - src: /etc/cvmfs/default.local - dest: /exports/cluster/cvmfs/default.local - owner: root - group: root - mode: 0644 - remote_src: true - - - name: Write openhpc munge key - copy: - content: "{{ vault_openhpc_mungekey | b64decode }}" - dest: "/exports/cluster/openhpc_munge.key" - owner: munge - group: munge - mode: 0400 - - # - name: Ensure /exports/cluster/inventory_hostname directory exists - # file: - # path: /exports/cluster/{{ inventory_hostname }} - # state: directory - # owner: root - # group: root - # mode: 0755 - - # - name: Template hostvars - # template: - # src: ../templates/hostvars.j2 - # dest: "/exports/cluster/{{ inventory_hostname }}/hostvars.yml" - # owner: root - # group: root - # mode: 0644 - - delegate_to: "{{ groups['control'] | first }}" diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index efc0cdcd9..64a67125e 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -1,15 +1,47 @@ +# compute-init + +TODO: describe current status. + +# Development To develop/debug this without actually having to build an image: -On deploy host: - .stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/extras.yml --tags compute_init +1. Add the compute nodes into the `compute_init` group: -On compute node: + cat <> $APPLIANCES_ENVIRONMENT_ROOT/inventory/extra_groups + [compute_init:children] + compute + EOF - [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init - [root@rl9-compute-0 rocky]# systemctl status ansible-init +2. Deploy a cluster using tofu and ansible/site.yml as normal. This will + additionally configure the control node to export compute hosts over NFS. + Check the cluster is up. + +3. Reimage the compute nodes: + + ansible-playbook --limit compute ansible/adhoc/rebuild + +4. Add metadata to a compute node e.g. via Horzon to turn on compute-init + playbook functionality. + +5. Fake an image build to deploy the compute-init playbook: + + ansible-playbook ansible/fatimage.yml --tags compute_init + +6. Fake a reimage of compute to run ansible-init and the compute-init playbook: + + On compute node where metadata was added: + + [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init + [root@rl9-compute-0 rocky]# systemctl status ansible-init + + Use `systemctl status ansible-init` to view stdout/stderr from Ansible. + +Steps 5/6 can be repeated with changes to the compute script. If desirable +reimage the compute node(s) first as in step 3. 
+# Results/progress Without any metadata: diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 84371c99a..e9366da2b 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -20,4 +20,4 @@ nfs_configurations: nfs_enable: server: "{{ inventory_hostname in groups['control'] }}" clients: false - nfs_export: "/exports/cluster" # control node has to copy in /etc/hosts to here + nfs_export: "/exports/cluster" From cb21e9cf5329598fdbfc048e4aae2f3154b75c41 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Sat, 14 Dec 2024 21:47:29 +0000 Subject: [PATCH 085/182] leave compute-init turned on in everything template --- docs/experimental/compute-init.md | 19 ++++++------------- environments/common/inventory/groups | 2 +- environments/common/layouts/everything | 1 - 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index 64a67125e..fe0fe5df4 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -7,29 +7,22 @@ TODO: describe current status. To develop/debug this without actually having to build an image: -1. Add the compute nodes into the `compute_init` group: - - cat <> $APPLIANCES_ENVIRONMENT_ROOT/inventory/extra_groups - [compute_init:children] - compute - EOF - -2. Deploy a cluster using tofu and ansible/site.yml as normal. This will +1. Deploy a cluster using tofu and ansible/site.yml as normal. This will additionally configure the control node to export compute hosts over NFS. Check the cluster is up. -3. Reimage the compute nodes: +2. Reimage the compute nodes: ansible-playbook --limit compute ansible/adhoc/rebuild -4. Add metadata to a compute node e.g. via Horzon to turn on compute-init +3. Add metadata to a compute node e.g. via Horzon to turn on compute-init playbook functionality. -5. Fake an image build to deploy the compute-init playbook: +4. Fake an image build to deploy the compute-init playbook: ansible-playbook ansible/fatimage.yml --tags compute_init -6. Fake a reimage of compute to run ansible-init and the compute-init playbook: +5. Fake a reimage of compute to run ansible-init and the compute-init playbook: On compute node where metadata was added: @@ -38,7 +31,7 @@ To develop/debug this without actually having to build an image: Use `systemctl status ansible-init` to view stdout/stderr from Ansible. -Steps 5/6 can be repeated with changes to the compute script. If desirable +Steps 4/5 can be repeated with changes to the compute script. If desirable reimage the compute node(s) first as in step 3. # Results/progress diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ba846777c..b944cccd6 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -137,7 +137,7 @@ freeipa_client # Hosts to run linux-anisble-init [compute_init] -# Hosts to deploy compute initialisation ansible-init script to. 
+# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on

[k3s]
# Hosts to run k3s server/agent
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index 0fc447cf5..f65e14fe6 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -84,7 +84,6 @@ cluster
 
 [compute_init:children]
 # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
-# TODO: actually should be empty for now
 compute
 
 [k3s:children]

From 53a7dc4fbb76abf11455d56c05df3b1701a91a8e Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Sat, 14 Dec 2024 22:36:50 +0000
Subject: [PATCH 086/182] get resolv_conf, etc_hosts and stackhpc.openhpc
 working

---
 .../roles/compute_init/files/compute-init.yml | 97 +++++++++++++++++--
 1 file changed, 88 insertions(+), 9 deletions(-)

diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 5661b467d..24a5090c6 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -6,7 +6,9 @@
   vars:
     os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
     server_node_ip: "{{ os_metadata.meta.k3s_server }}"
-    compute_groups: "{{ os_metadata.meta.compute_groups | default([]) }}"
+    enable_slurmd: "{{ os_metadata.meta.enable_slurmd | default(false) | bool }}"
+    enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}"
+    enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}"
 
     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
     # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty
     resolv_conf_nameservers: []
@@ -17,10 +19,10 @@
     - name: Report skipping initialization if not compute node
       # meta: end_play produces no output
       debug:
-        msg: "Skipping compute initialization as metadata compute_groups is empty"
+        msg: "Skipping compute initialization: Metadata enable_slurmd is not true"
 
     - meta: end_play
-      when: compute_groups | length == 0
+      when: not enable_slurmd
 
     - name: Ensure the mount directory exists
       file:
@@ -37,16 +39,15 @@
         fstype: nfs
         opts: ro,sync
         state: mounted
-      register: nfs_mount_result
-      ignore_errors: true
       register: _mount_mnt_cluster
+      ignore_errors: true
       # TODO: add some retries here?
 
     - block:
       - name: Report skipping initialization if cannot mount nfs
         # meta: end_play produces no output
         debug:
-          msg: "Skipping compute initialization as cannot mount exports/cluster share"
+          msg: "Skipping compute initialization: Failed to mount /exports/cluster from control node {{ server_node_ip }}"
 
       - meta: end_play
     when: _mount_mnt_cluster.failed
@@ -56,6 +57,84 @@
       include_vars:
        file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname
 
-    - name: Demonstrate hostvars have loaded
-      debug:
-        var: prometheus_version
+    # TODO: should /mnt/cluster now be UNMOUNTED to avoid future hang-ups?
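+    # An untested sketch for that unmount (module choice is an assumption):
+    # - name: Unmount /mnt/cluster once hostvars are loaded
+    #   ansible.posix.mount:
+    #     path: /mnt/cluster
+    #     state: unmounted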
+ + - name: Configure resolve.conf + block: + - name: Set nameservers in /etc/resolv.conf + ansible.builtin.template: + src: /etc/ansible-init/templates/resolv.conf.j2 + dest: /etc/resolv.conf + owner: root + group: root + mode: u=rw,og=r + + - name: Disable NetworkManager control of resolv.conf + ansible.builtin.copy: + src: /etc/ansible-init/files/NetworkManager-dns-none.conf + dest: /etc/NetworkManager/conf.d/90-dns-none.conf + owner: root + group: root + mode: u=rw,og=r + register: _copy_nm_config + + - name: Reload NetworkManager + ansible.builtin.systemd: + name: NetworkManager + state: reloaded + when: _copy_nm_config.changed | default(false) + when: enable_resolv_conf + + - name: Copy cluster /etc/hosts + copy: + src: /mnt/cluster/hosts + dest: /etc/hosts + owner: root + group: root + mode: 0644 + when: enable_etc_hosts + + # TODO: - name: NFS client mount + + # TODO: - name: Manila mount + + # TODO: - name: Basic users setup + + # TODO: - name: Configure EESSI + + # TODO: - name: Configure openhpc + # NB: don't need conditional block on enable_slurmd as have already exited + # if not the case + - name: Write Munge key + copy: + content: "{{ openhpc_munge_key }}" + dest: "/etc/munge/munge.key" + owner: munge + group: munge + mode: 0400 + + - name: Set slurmctld location for configless operation + lineinfile: + path: /etc/sysconfig/slurmd + line: "SLURMD_OPTIONS='--conf-server {{ server_node_ip }}'" + regexp: "^SLURMD_OPTIONS=" + create: yes + owner: root + group: root + mode: 0644 + + - name: Ensure Munge service state + service: + name: munge + enabled: true + state: started + + - name: Ensure slurmd service state + service: + name: slurmd + enabled: true + state: started + + - name: Ensure node is resumed + # TODO: consider if this is always safe for all job states? + command: scontrol update state=resume nodename={{ ansible_hostname }} From 1f458516f46d223cb024447b59dfeca6b7cfcacb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Sat, 14 Dec 2024 22:38:12 +0000 Subject: [PATCH 087/182] doc problems with templating out hostvars --- docs/experimental/compute-init.md | 53 ++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index fe0fe5df4..17ed370e6 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -1,6 +1,9 @@ # compute-init -TODO: describe current status. +The following roles are currently functional: +- resolv_conf +- etc_hosts +- stackhpc.openhpc # Development @@ -8,7 +11,7 @@ To develop/debug this without actually having to build an image: 1. Deploy a cluster using tofu and ansible/site.yml as normal. This will - additionally configure the control node to export compute hosts over NFS. + additionally configure the control node to export compute hostvars over NFS. Check the cluster is up. 2. Reimage the compute nodes: @@ -22,6 +25,10 @@ To develop/debug this without actually having to build an image: ansible-playbook ansible/fatimage.yml --tags compute_init + NB: This will also re-export the compute hostvars, as the nodes are not + in the builder group, which conveniently means any changes made to that + play also get picked up. + 5. Fake a reimage of compute to run ansible-init and the compute-init playbook: On compute node where metadata was added: @@ -31,8 +38,9 @@ To develop/debug this without actually having to build an image: Use `systemctl status ansible-init` to view stdout/stderr from Ansible. 
-Steps 4/5 can be repeated with changes to the compute script. If desirable -reimage the compute node(s) first as in step 3. +Steps 4/5 can be repeated with changes to the compute script. If required, +reimage the compute node(s) first as in step 2 and/or add additional metadata +as in step 3. # Results/progress @@ -144,3 +152,40 @@ This commit - shows that hostvars have loaded: Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. +# Design notes + +- In general, we don't want to rely on NFS export. So should e.g. copy files + from this mount ASAP in the compute-init script. TODO: +- There are a few possible approaches: + + 1. Control node copies files resulting from role into cluster exports, + compute-init copies to local disk. Only works if files are not host-specific + Examples: etc_hosts, eessi config? + + 2. Re-implement the role. Works if the role vars are not too complicated, + (else they all need to be duplicated in compute-init). Could also only + support certain subsets of role functionality or variables + Examples: resolv_conf, stackhpc.openhpc + + +# Problems with templated hostvars + +Here are all the ones which actually rely on hostvars from other nodes, +which therefore aren't available: + +``` +[root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", + "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", + "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", + "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", + "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", + "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", + "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", + "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", + "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", + "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" +``` + +More generally, there is nothing to stop any group var depending on a +"{{ hostvars[] }}" interpolation ... 
From c162e18e447410487dd481abcf982477ac8b39b4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 09:53:52 +0000 Subject: [PATCH 088/182] Refactored common repolist --- ansible/roles/dnf_repos/defaults/main.yml | 10 ++++----- ansible/roles/pulp_site/defaults/main.yml | 22 +++++++++---------- .../inventory/group_vars/all/defaults.yml | 21 +++++++++--------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 281a57c7e..4a0c9fd2a 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -8,16 +8,16 @@ dnf_repos_password: "{{ omit }}" dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].baseos }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.baseos[ansible_distribution_version] }}" - file: rocky name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].appstream }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.appstream[ansible_distribution_version] }}" - file: rocky name: crb - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].crb }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_minor_timestamps[ansible_distribution_version].extras }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.extras[ansible_distribution_version] }}" -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ appliances_repo_major_timestamps[ansible_distribution_major_version].epel }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ appliances_repo_timestamps.epel[ansible_distribution_major_version] }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 76ad14988..2c90d2968 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -15,20 +15,18 @@ pulp_site_target_arch: "{{ pulp_site_target_facts['architecture'] }}" pulp_site_target_distribution: "{{ pulp_site_target_facts['distribution'] | lower }}" pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" -pulp_site_version_timestamps: "{{ appliances_repo_minor_timestamps[pulp_site_target_distribution_version] }}" 
-pulp_site_major_version_timestamps: "{{ appliances_repo_major_timestamps[pulp_site_target_distribution_version_major] }}" pulp_site_rpm_info: -- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.baseos }}" - subpath: "{{ _pulp_site_rocky_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.baseos }}" -- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.appstream }}" - subpath: "{{ _pulp_site_rocky_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.appstream }}" -- name: "crb-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.crb }}" - subpath: "{{ _pulp_site_rocky_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.crb }}" -- name: "extras-{{ pulp_site_target_distribution_version }}-{{ pulp_site_version_timestamps.extras }}" - subpath: "{{ _pulp_site_rocky_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ pulp_site_version_timestamps.extras }}" -- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ pulp_site_major_version_timestamps.epel }}" - subpath: "epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ pulp_site_major_version_timestamps.epel }}" +- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.baseos[pulp_site_target_distribution_version] }}" + subpath: "{{ _pulp_site_rocky_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.baseos[pulp_site_target_distribution_version] }}" +- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.appstream[pulp_site_target_distribution_version] }}" + subpath: "{{ _pulp_site_rocky_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.appstream[pulp_site_target_distribution_version] }}" +- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.crb[pulp_site_target_distribution_version] }}" + subpath: "{{ _pulp_site_rocky_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.crb[pulp_site_target_distribution_version] }}" +- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.extras[pulp_site_target_distribution_version] }}" + subpath: "{{ _pulp_site_rocky_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.extras[pulp_site_target_distribution_version] }}" +- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_repo_timestamps.epel[pulp_site_target_distribution_version_major] }}" + subpath: "epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ appliances_repo_timestamps.epel[pulp_site_target_distribution_version_major] }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e1acdf19b..1bac4590d 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,13 +82,14 @@ appliances_local_users: "{{ appliances_local_users_default + 
appliances_local_us ########################################################################################### -appliances_repo_minor_timestamps: - '9.4': - baseos: 20240816T002610 - appstream: 20240816T002610 - crb: 20240816T002610 - extras: 20240816T002610 - -appliances_repo_major_timestamps: - '9': - epel: 20240902T080424 +appliances_repo_timestamps: + baseos: + '9.4': 20240816T002610 + appstream: + '9.4': 20240816T002610 + crb: + '9.4': 20240816T002610 + extras: + '9.4': 20240816T002610 + epel: + '9': 20240902T080424 From bda3f0d5ad31e1c9e2faf1d6cbdfa0b293ce76c8 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 16 Dec 2024 10:09:04 +0000 Subject: [PATCH 089/182] Code review doc/comment suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/adhoc/deploy-pulp.yml | 1 - docs/experimental/pulp.md | 4 ++-- environments/.stackhpc/inventory/group_vars/builder.yml | 2 +- environments/common/inventory/groups | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml index 38cb79289..2858d032b 100644 --- a/ansible/adhoc/deploy-pulp.yml +++ b/ansible/adhoc/deploy-pulp.yml @@ -11,7 +11,6 @@ become: yes hosts: _pulp_host tasks: - - name: Install pulp ansible.builtin.include_role: name: pulp_site diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index d1a40ba52..d2bc0db72 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -1,13 +1,13 @@ # Pulp Server -In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's Ark Pulp server. The appliance will sync relevant repositories to local Pulp server which will be used for image builds. Using a local server can be enabled by adding `pulp` to the build groups and overriding `dnf_repos_repolist` to point at content hosted on the local server. +In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `dnf_repos_repolist` to point at content hosted on the local server. ## Deploying/configuring Pulp Server ### Deploying a Pulp server A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml' playbook. This can be run with `ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` -This will print a Pulp endpoint which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note that this server's content isn't authenticated so assumes the server is deployed behind a secure network. +This will print a Pulp endpoint which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. ### Using an existing Pulp server An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. Note that this assumes the same configuration as the appliance deployed pulp i.e no content authentication. 
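+
+For example (sketch - the hostname, file placement and vault variable name are illustrative only):
+
+    # e.g. environments/site/inventory/group_vars/all/pulp.yml:
+    appliances_pulp_url: http://my-pulp-server:8080
+    pulp_site_password: "{{ vault_my_pulp_admin_password }}" # hypothetical vault-encrypted secret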
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index ce1666973..8d4c8b3bb 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -9,6 +9,6 @@ # appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" # pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" - +# Alternatively, configure to use ark directly: dnf_repos_username: slurm-app-ci dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 8f52477cd..d49f3d6c1 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -150,4 +150,4 @@ freeipa_client builder [pulp:children] -# Hosts used to run Pulp API commands +# Add builder to this group to enable automatically syncing of pulp during image build From bc5e26efe139b50296cb9cd2a1fa47f98a9fecc7 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 10:51:18 +0000 Subject: [PATCH 090/182] docs/groups corrections --- docs/experimental/pulp.md | 6 +++--- environments/.stackhpc/inventory/group_vars/builder.yml | 1 + environments/common/inventory/groups | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index d2bc0db72..8c9bfd615 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -1,13 +1,13 @@ # Pulp Server -In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `dnf_repos_repolist` to point at content hosted on the local server. +In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `appliances_pulp_url` to point at the local Pulp's URL. ## Deploying/configuring Pulp Server ### Deploying a Pulp server A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml' playbook. This can be run with -`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` -This will print a Pulp endpoint which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. +`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` +where `target_host` is any resolvable host. This will print a Pulp URL which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. ### Using an existing Pulp server An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. 
Note that this assumes the same configuration as the appliance deployed pulp i.e no content authentication. diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 8d4c8b3bb..b12e81826 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -9,6 +9,7 @@ # appliances_pulp_url: "{{ pulp_server_config[lookup('env','CI_CLOUD')].url }}" # pulp_site_password: "{{ pulp_server_config[lookup('env','CI_CLOUD')].password }}" + # Alternatively, configure to use ark directly: dnf_repos_username: slurm-app-ci dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index d49f3d6c1..6f77eeab5 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -149,5 +149,6 @@ freeipa_client # Hosts to replace system repos with Pulp repos builder -[pulp:children] +[pulp] # Add builder to this group to enable automatically syncing of pulp during image build +# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` risks leaking Ark creds From 18b220e1b54d991946ebba4dbd386ed96f392993 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 11:42:49 +0000 Subject: [PATCH 091/182] moved defaults to CI and updated docs --- ansible/roles/pulp_site/defaults/main.yml | 4 +--- docs/experimental/pulp.md | 2 +- docs/image-build.md | 9 +++++---- .../.stackhpc/inventory/group_vars/builder.yml | 4 ++++ environments/common/inventory/group_vars/all/pulp.yml | 10 ++++++++++ 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 2c90d2968..d343d4998 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -1,12 +1,10 @@ pulp_site_url: "{{ appliances_pulp_url }}" pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed +pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_upstream_content_url: https://ark.stackhpc.com/pulp/content -pulp_site_upstream_username: slurm-app-ci -pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}" _pulp_site_rocky_prefix: "{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" -pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index 8c9bfd615..e0f32cdc1 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -14,4 +14,4 @@ An existing Pulp server can be used to host Ark repos by overriding `pulp_site_p ## Syncing Pulp content with Ark -If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must supply your Ark credentials, either by overriding `pulp_site_upstream_password` or setting environment variable `ARK_PASSWORD`. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`, optionally setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version`. 
+If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.4 with x86_64 architecture, but can be overridden by setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
diff --git a/docs/image-build.md b/docs/image-build.md
index a7d2e951b..db51265a3 100644
--- a/docs/image-build.md
+++ b/docs/image-build.md
@@ -17,7 +17,8 @@ The fat images StackHPC builds and tests in CI are available from [GitHub releas
 To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image:
 
 1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration).
-2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum:
+2. The provided dev credentials for StackHPC's "Ark" Pulp server must be added to the target environments. This is done by overriding `dnf_repos_username` and `dnf_repos_password` with your vault encrypted credentials in `environments//inventory/group_vars/all/pulp.yml`. See the [experimental docs](experimental/pulp.md) if you instead wish to use a local Pulp server.
+3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum:
 
 ```hcl
 flavor = "general.v1.small" # VM flavor to use for builder VMs
@@ -35,9 +36,9 @@ To build either a site-specific fat image from scratch, or to extend an existing
 - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch.
 - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality.
 
-3. Activate the venv and the relevant environment.
+4. Activate the venv and the relevant environment.
 
-4. Build images using the relevant variable definition file, e.g.:
+5. Build images using the relevant variable definition file, e.g.:
 
     cd packer/
     PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
@@ -52,7 +53,7 @@ To build either a site-specific fat image from scratch, or to extend an existing
    then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is
    [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445).
 
-5. 
The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash.
+6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash.
 
 # Build Process
 
diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml
index b12e81826..5130e9d84 100644
--- a/environments/.stackhpc/inventory/group_vars/builder.yml
+++ b/environments/.stackhpc/inventory/group_vars/builder.yml
@@ -13,3 +13,7 @@
 # Alternatively, configure to use ark directly:
 dnf_repos_username: slurm-app-ci
 dnf_repos_password: "{{ lookup('env','ARK_PASSWORD') }}"
+
+# Can be set regardless of approach above:
+pulp_site_upstream_username: slurm-app-ci
+pulp_site_upstream_password: "{{ lookup('ansible.builtin.env', 'ARK_PASSWORD') }}"
diff --git a/environments/common/inventory/group_vars/all/pulp.yml b/environments/common/inventory/group_vars/all/pulp.yml
index 02b7aa816..22bb83216 100644
--- a/environments/common/inventory/group_vars/all/pulp.yml
+++ b/environments/common/inventory/group_vars/all/pulp.yml
@@ -1 +1,11 @@
 pulp_site_port: 8080
+
+# If using Ark directly (no local Pulp server), override the following with Ark creds
+
+# dnf_repos_username:
+# dnf_repos_password:
+
+# If instead using local Pulp server, override below with Ark creds
+
+# pulp_site_upstream_username:
+# pulp_site_upstream_password:

From 34fee1cb17a32c4d0fc52cb2997e8f1c6458f730 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Mon, 16 Dec 2024 12:26:27 +0000
Subject: [PATCH 092/182] updated docs

---
 docs/operations.md | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/docs/operations.md b/docs/operations.md
index a20d7f10c..50eef9053 100644
--- a/docs/operations.md
+++ b/docs/operations.md
@@ -63,17 +63,28 @@ This is usually a two-step process:
 
 Deploying the additional nodes and applying these changes requires rerunning both Terraform and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster).
 
 # Adding Additional Packages
-Packages from any enabled DNF repositories (which always includes EPEL, PowerTools and OpenHPC) can be added to all nodes by defining a list `openhpc_packages_extra` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`. For example:
-
-    # environments/foo-base/inventory/group_vars/all/openhpc.yml:
-    openhpc_packages_extra:
+By default, the following utility packages are installed during build:
+- htop
+- nano
+- screen
+- tmux
+- wget
+- bind-utils
+- net-tools
+- postfix
+- git
+- latest python version for system (3.6 for Rocky 8.9 and 3.12 for Rocky 9.4)
+Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_other_extra_package` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example:
 
+    # environments/foo-base/inventory/group_vars/all/defaults.yml:
+    appliances_other_extra_package:
     - somepackage
     - anotherpackage
 
 The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). 
Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules.
 
-To add these packages to the current cluster, run the same command as for [Reconfiguring Slurm](#Reconfiguring-Slurm). TODO: describe what's required to add these to site-specific images.
+If you wish to install packages during runtime, the `site.yml` playbook should be run with `appliances_packages_during_configure` overridden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enable DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users).
 
 If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this:
 - `ansible.builtin.yum_repository`: Add a repo from a URL providing a 'repodata' directory.

From 9c41725c64d1a14bd4247bbd7c06daa4835e4240 Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Mon, 16 Dec 2024 12:29:58 +0000
Subject: [PATCH 093/182] bump images

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 8659f3e90..989b9f9bb 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241213-1402-a2a705c9",
-        "RL9": "openhpc-RL9-241213-1402-a2a705c9"
+        "RL8": "openhpc-RL8-241216-1146-18b220e1",
+        "RL9": "openhpc-RL9-241216-1146-18b220e1"
     }
 }

From a4352920dcd0dd1ccb9ab798b44a75aa4d2a1ec9 Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Mon, 16 Dec 2024 13:42:52 +0000
Subject: [PATCH 094/182] bump image

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 8659f3e90..44059d97c 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241213-1402-a2a705c9",
-        "RL9": "openhpc-RL9-241213-1402-a2a705c9"
+        "RL8": "openhpc-RL8-241216-1231-83161c73",
+        "RL9": "openhpc-RL9-241216-1232-83161c73"
     }
 }

From 30a278ee64e7202b8ec7a3da9753d81c5c7fd42d Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Mon, 16 Dec 2024 15:37:12 +0000
Subject: [PATCH 095/182] moved to extras

---
 ansible/extras.yml   | 10 ++++++++++
 ansible/packages.yml | 10 ----------
 2 files changed, 10 insertions(+), 10 deletions(-)
 delete mode 100644 ansible/packages.yml

diff --git a/ansible/extras.yml b/ansible/extras.yml
index 107f85252..ea5c8eb12 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -44,3 +44,13 @@
   tasks:
     - import_role:
         name: k9s
+
+- hosts: extra_packages
+  become: yes
+  tags:
+    - extra_packages
+  tasks:
+    - name: Install additional packages
+      dnf: 
name: "{{ appliances_extra_packages }}" + when: appliances_mode != 'configure' or appliances_packages_during_configure diff --git a/ansible/packages.yml b/ansible/packages.yml deleted file mode 100644 index e447dcda7..000000000 --- a/ansible/packages.yml +++ /dev/null @@ -1,10 +0,0 @@ - -- hosts: extra_packages - become: yes - tags: - - extra_packages - tasks: - - name: Install additional packages - dnf: - name: "{{ appliances_extra_packages }}" - when: appliances_mode != 'configure' or appliances_packages_during_configure From 6c74a1e15fcfe809b28a3bd7d5bc582b90175105 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 15:51:29 +0000 Subject: [PATCH 096/182] repos now controlled by groups + possible during configure + guarded against cred leaks --- ansible/bootstrap.yml | 14 ++++++++++++++ ansible/disable-repos.yml | 8 ++++++++ ansible/fatimage.yml | 18 +----------------- ansible/site.yml | 1 + environments/common/inventory/groups | 2 +- 5 files changed, 25 insertions(+), 18 deletions(-) create mode 100644 ansible/disable-repos.yml diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 733d4b3f8..a504f3545 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -110,6 +110,20 @@ policy: "{{ selinux_policy }}" register: sestatus +- hosts: dnf_repos + become: yes + tasks: + - name: Check that creds won't be leaked to users + ansible.builtin.assert: + that: dnf_repos_password is undefined + fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password' + when: appliances_mode == 'configure' + - name: Replace system repos with pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml + when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided + # --- tasks after here require access to package repos --- - hosts: squid tags: squid diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml new file mode 100644 index 000000000..d7dc4fd55 --- /dev/null +++ b/ansible/disable-repos.yml @@ -0,0 +1,8 @@ +- hosts: dnf_repos + become: yes + tasks: + - name: Disable pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: disable_repos.yml + when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 5d84fcf90..4c8367816 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -27,15 +27,6 @@ delegate_to: localhost when: appliances_mode != 'configure' -- hosts: dnf_repos - become: yes - tasks: - - name: Replace system repos with pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml - when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided - - import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook @@ -229,14 +220,7 @@ import_role: name: doca -- hosts: dnf_repos - become: yes - tasks: - - name: Disable pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: disable_repos.yml - when: appliances_mode != 'configure' and ansible_distribution_major_version == "9" #TODO update role once RL8 config decided +- import_playbook: disable_repos.yml - name: Run post.yml hook vars: diff --git a/ansible/site.yml b/ansible/site.yml index bb379399d..222ee8697 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,6 +27,7 @@ - import_playbook: slurm.yml - import_playbook: portal.yml - import_playbook: monitoring.yml +- 
import_playbook: disable_repos.yml - name: Run post.yml hook vars: diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 6f77eeab5..062276f76 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -147,8 +147,8 @@ freeipa_client [dnf_repos:children] # Hosts to replace system repos with Pulp repos +# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users builder [pulp] # Add builder to this group to enable automatically syncing of pulp during image build -# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` risks leaking Ark creds From 2357a730d060ad43289d022024de118093984017 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 16 Dec 2024 15:58:58 +0000 Subject: [PATCH 097/182] typo --- ansible/fatimage.yml | 2 +- ansible/site.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 4c8367816..55e56e612 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -220,7 +220,7 @@ import_role: name: doca -- import_playbook: disable_repos.yml +- import_playbook: disable-repos.yml - name: Run post.yml hook vars: diff --git a/ansible/site.yml b/ansible/site.yml index 222ee8697..d973d9cb3 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -27,7 +27,7 @@ - import_playbook: slurm.yml - import_playbook: portal.yml - import_playbook: monitoring.yml -- import_playbook: disable_repos.yml +- import_playbook: disable-repos.yml - name: Run post.yml hook vars: From bf6f3680ec49906cc48b170b003c67627e62aca4 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:59:43 +0000 Subject: [PATCH 098/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 989b9f9bb..7c59abf36 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241216-1146-18b220e1", - "RL9": "openhpc-RL9-241216-1146-18b220e1" + "RL8": "openhpc-RL8-241216-1607-2357a730", + "RL9": "openhpc-RL9-241216-1607-2357a730" } } From c6a6bf365e74c95e2079ce5f73753f4285b3d95b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 17 Dec 2024 11:33:31 +0000 Subject: [PATCH 099/182] re-enable CI on compute-init script branch --- .github/workflows/stackhpc.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 848517bb8..b08854adb 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -24,8 +24,6 @@ on: - '!.gitignore' - '!.github/workflows/' - '.github/workflows/stackhpc' - branches: - - '!feat/compute-script' jobs: openstack: name: openstack-ci From 5455eec66a243a295f7651b23eafb3ea52bfb65c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 17 Dec 2024 11:37:00 +0000 Subject: [PATCH 100/182] doc compute_init/export.yml ordering --- ansible/extras.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index 10ca4cc9d..ad538c58a 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -37,8 
+37,10 @@
     - import_role:
         name: persist_hostkeys
 
-# TODO: Is this is the right place?
-- hosts: compute_init:!builder
+
+- name: Setup NFS export for compute node configuration
+  hosts: compute_init:!builder
+  # NB: has to be after eessi and os-manila-mount
   tags: compute_init
   become: yes
   name: Export hostvars

From 36cf771cae4d7706d8b99308ba7f028b6d279472 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 17 Dec 2024 11:39:45 +0000
Subject: [PATCH 101/182] change name for compute-init enablement

---
 ansible/roles/compute_init/files/compute-init.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 24a5090c6..74face5e1 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -6,7 +6,7 @@
   vars:
     os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}"
     server_node_ip: "{{ os_metadata.meta.k3s_server }}"
-    enable_slurmd: "{{ os_metadata.meta.enable_slurmd | default(false) | bool }}"
+    enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}"
     enable_resolv_conf: "{{ os_metadata.meta.enable_senable_resolv_conf | default(false) | bool }}"
     enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}"
 
@@ -19,10 +19,10 @@
   - name: Report skipping initialization if not compute node
     # meta: end_play produces no output
     debug:
-      msg: "Skipping compute initialization: Metadata enable_slurmd is not true"
+      msg: "Skipping compute initialization: Metadata enable_compute is not true"
 
   - meta: end_play
-    when: not enable_slurmd
+    when: not enable_compute
 
   - name: Ensure the mount directory exists
     file:
@@ -103,7 +103,7 @@
 
   # TODO: - name: Configure EESSI
   # TODO: - name: Configure openhpc
-  # NB: don't need conditional block on enable_slurmd as have already exited
+  # NB: don't need conditional block on enable_compute as have already exited
   # if not the case
   - name: Write Munge key
     copy:

From 5e7f809276ffd1c259e5d8ac19b61abd67be6b21 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 17 Dec 2024 12:05:17 +0000
Subject: [PATCH 102/182] move most compute-init docs to the role readme

---
 ansible/roles/compute_init/README.md | 119 +++++++++++++++++++++++++++
 docs/experimental/compute-init.md    |  80 +-----------------
 2 files changed, 120 insertions(+), 79 deletions(-)
 create mode 100644 ansible/roles/compute_init/README.md

diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md
new file mode 100644
index 000000000..dac59e2d3
--- /dev/null
+++ b/ansible/roles/compute_init/README.md
@@ -0,0 +1,119 @@
+# EXPERIMENTAL: compute-init
+
+Experimental / in-progress functionality to allow compute nodes to rejoin the
+cluster after a reboot.
+
+To enable this, add compute nodes (or a subset of them) into the `compute_init`
+group.
+
+This works as follows:
+1. During image build, an ansible-init playbook and supporting files
+(e.g. templates, filters, etc) are installed.
+2. Cluster instances are created as usual; the above compute-init playbook does
+not run.
+3. The `site.yml` playbook is run as usual to configure all the instances into
+a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS
+share is created on the control node containing:
+    - an /etc/hosts file for the cluster
+    - Hostvars for each compute node
+4. 
On reboot of a compute node, ansible-init runs the compute-init playbook +which: + a. Checks whether the `enable_compute` metadata flag is set, and exits if + not. + b. Tries to mount the above `/exports/cluster` NFS share from the control + node, and exits if it cannot. + c. Configures itself using the exported hostvars, depending on the + `enable_*` flags set in metadata. + d. Issues an `scontrol` command to resume the node (because Slurm will + consider it as "unexpectedly rebooted"). + +The check in 4b. above is what prevents the compute-init script from trying +to configure the node before the services on the control node are available +(which requires running the site.yml playbook). + +The following roles are currently fully functional: +- `resolv_conf` +- `etc_hosts` +- `stackhpc.openhpc` + +# Development/debugging + +To develop/debug this without actually having to build an image: + + +1. Deploy a cluster using tofu and ansible/site.yml as normal. This will + additionally configure the control node to export compute hostvars over NFS. + Check the cluster is up. + +2. Reimage the compute nodes: + + ansible-playbook --limit compute ansible/adhoc/rebuild + +3. Add metadata to a compute node e.g. via Horzon to turn on compute-init + playbook functionality. + +4. Fake an image build to deploy the compute-init playbook: + + ansible-playbook ansible/fatimage.yml --tags compute_init + + NB: This will also re-export the compute hostvars, as the nodes are not + in the builder group, which conveniently means any changes made to that + play also get picked up. + +5. Fake a reimage of compute to run ansible-init and the compute-init playbook: + + On compute node where metadata was added: + + [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init + [root@rl9-compute-0 rocky]# systemctl status ansible-init + + Use `systemctl status ansible-init` to view stdout/stderr from Ansible. + +Steps 4/5 can be repeated with changes to the compute script. If required, +reimage the compute node(s) first as in step 2 and/or add additional metadata +as in step 3. + + +# Design notes +- Duplicating code in roles into the `compute-init` script is unfortunate, but + does allow developing this functionality without wider changes to the + appliance. + +- In general, we don't want to rely on NFS export. So should e.g. copy files + from this mount ASAP in the compute-init script. TODO: + +- There are a couple of approaches to supporting existing roles using `compute-init`: + + 1. Control node copies files resulting from role into cluster exports, + compute-init copies to local disk. Only works if files are not host-specific + Examples: etc_hosts, eessi config? + + 2. Re-implement the role. Works if the role vars are not too complicated, + (else they all need to be duplicated in compute-init). 
Could also only + support certain subsets of role functionality or variables + Examples: resolv_conf, stackhpc.openhpc + +- Some hostvars are tempalted from hostvars from other nodes, which aren't + available in the current approach: + + ``` + [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", + "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", + "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", + "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", + "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", + "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", + "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", + "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", + "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", + "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" + ``` + + More generally, there is nothing to stop any group var depending on a + "{{ hostvars[] }}" interpolation ... + + Currently, the only functionality this has been problematic for is setting + the control node address for the slurmd node, which has been done using + the (metadata-provided) IP, given this is needed to do the NFS mount anyway + in the absence of working internal DNS. diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index 17ed370e6..dae840d95 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -1,46 +1,6 @@ # compute-init -The following roles are currently functional: -- resolv_conf -- etc_hosts -- stackhpc.openhpc - -# Development - -To develop/debug this without actually having to build an image: - - -1. Deploy a cluster using tofu and ansible/site.yml as normal. This will - additionally configure the control node to export compute hostvars over NFS. - Check the cluster is up. - -2. Reimage the compute nodes: - - ansible-playbook --limit compute ansible/adhoc/rebuild - -3. Add metadata to a compute node e.g. via Horzon to turn on compute-init - playbook functionality. - -4. Fake an image build to deploy the compute-init playbook: - - ansible-playbook ansible/fatimage.yml --tags compute_init - - NB: This will also re-export the compute hostvars, as the nodes are not - in the builder group, which conveniently means any changes made to that - play also get picked up. - -5. Fake a reimage of compute to run ansible-init and the compute-init playbook: - - On compute node where metadata was added: - - [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init - [root@rl9-compute-0 rocky]# systemctl status ansible-init - - Use `systemctl status ansible-init` to view stdout/stderr from Ansible. - -Steps 4/5 can be repeated with changes to the compute script. If required, -reimage the compute node(s) first as in step 2 and/or add additional metadata -as in step 3. 
+See the role README.md # Results/progress @@ -151,41 +111,3 @@ This commit - shows that hostvars have loaded: Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. - -# Design notes - -- In general, we don't want to rely on NFS export. So should e.g. copy files - from this mount ASAP in the compute-init script. TODO: -- There are a few possible approaches: - - 1. Control node copies files resulting from role into cluster exports, - compute-init copies to local disk. Only works if files are not host-specific - Examples: etc_hosts, eessi config? - - 2. Re-implement the role. Works if the role vars are not too complicated, - (else they all need to be duplicated in compute-init). Could also only - support certain subsets of role functionality or variables - Examples: resolv_conf, stackhpc.openhpc - - -# Problems with templated hostvars - -Here are all the ones which actually rely on hostvars from other nodes, -which therefore aren't available: - -``` -[root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml - "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", - "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", - "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", - "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", - "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", - "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", - "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", - "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", - "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", - "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" -``` - -More generally, there is nothing to stop any group var depending on a -"{{ hostvars[] }}" interpolation ... 
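As an illustration of the hostvars limitation described in the README changes above: once the `/exports/cluster` share is mounted, a rebooted node can load its own exported hostvars in a single-host play, but any value templated from other hosts' hostvars cannot resolve there. A minimal sketch, assuming the share is already mounted at `/mnt/cluster` as in the playbooks above (the `debug` task is illustrative only, not part of the appliance):

```yaml
# Sketch only: load this node's exported hostvars on a rebooted compute node.
- hosts: localhost
  become: yes
  tasks:
    - name: Load exported hostvars for this node
      ansible.builtin.include_vars:
        file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml"

    - name: Use a value that does not depend on other hosts' hostvars
      ansible.builtin.debug:
        var: openhpc_slurm_control_host
      # A value templated as "{{ hostvars[groups['grafana'].0].api_address }}"
      # would fail here: this single-host play holds no hostvars for other
      # cluster members, which is exactly the limitation listed above.
```
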
From 11580b3d4855f01bf9f6108802b4edf5b3625227 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:09:04 +0000 Subject: [PATCH 103/182] Remove use of FIPs for leafcloud packer builds (#498) --- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 5 ++++- environments/.stackhpc/SMS.pkrvars.hcl | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index 5adf4199c..db0b28b49 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -4,4 +4,7 @@ networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" security_groups = ["default", "SSH"] -floating_ip_network = "external" +# see environments/.stackhpc/inventory/group_vars/all/bastion.yml: +ssh_bastion_username = "slurm-app-ci" +ssh_bastion_host = "195.114.30.222" +ssh_bastion_private_key_file = "~/.ssh/id_rsa" diff --git a/environments/.stackhpc/SMS.pkrvars.hcl b/environments/.stackhpc/SMS.pkrvars.hcl index b88106fe8..3ebe734eb 100644 --- a/environments/.stackhpc/SMS.pkrvars.hcl +++ b/environments/.stackhpc/SMS.pkrvars.hcl @@ -2,6 +2,7 @@ flavor = "general.v1.small" networks = ["e2b9e59f-43da-4e1c-b558-dc9da4c0d738"] # stackhpc-ipv4-geneve ssh_keypair_name = "slurm-app-ci" ssh_private_key_file = "~/.ssh/id_rsa" +# see environments/.stackhpc/inventory/group_vars/all/bastion.yml: ssh_bastion_username = "slurm-app-ci" ssh_bastion_host = "185.45.78.150" -ssh_bastion_private_key_file = "~/.ssh/id_rsa" \ No newline at end of file +ssh_bastion_private_key_file = "~/.ssh/id_rsa" From 1ba41d8bfb1b1e3ec716cb39c4a34bc3ed8f4cb1 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:54:45 +0000 Subject: [PATCH 104/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 7c59abf36..be9dfe5cb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241216-1607-2357a730", - "RL9": "openhpc-RL9-241216-1607-2357a730" + "RL8": "openhpc-RL8-241217-1146-d77be652", + "RL9": "openhpc-RL9-241217-1145-d77be652" } } From a868642a8995130c187046762d00a68d109e5c0a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 17 Dec 2024 13:09:19 +0000 Subject: [PATCH 105/182] bump CI image --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 5b9d845ef..1495ce5a7 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241211-1322-ded60c2c", - "RL9": "openhpc-RL9-241211-1322-ded60c2c" + "RL8": "openhpc-RL8-241217-1210-5e7f8092", + "RL9": "openhpc-RL9-241217-1209-5e7f8092" } } From 4b0e36dd7f9a67c73e69980732cbbb908c7b5889 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Dec 2024 13:33:51 +0000 Subject: 
[PATCH 106/182] now performs update in fatimage --- .github/workflows/fatimage.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 331035001..6649a3533 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -23,11 +23,11 @@ jobs: matrix: # build RL8, RL9 build: - image_name: openhpc-RL8 - source_image_name: rocky-latest-RL8 - inventory_groups: control,compute,login + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: control,compute,login,update - image_name: openhpc-RL9 - source_image_name: rocky-latest-RL9 - inventory_groups: control,compute,login + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: control,compute,login,update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack From bc36b78121e8828855be9b247f2ea07fe8113882 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Dec 2024 14:19:08 +0000 Subject: [PATCH 107/182] testing enabling release train for 8.10 --- .github/workflows/nightlybuild.yml | 2 +- ansible/bootstrap.yml | 1 - ansible/disable-repos.yml | 1 - ansible/roles/dnf_repos/defaults/main.yml | 24 +++++++++++++++---- .../inventory/group_vars/all/defaults.yml | 5 ++++ 5 files changed, 25 insertions(+), 8 deletions(-) diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 2485cd2df..ec920ce8d 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -25,7 +25,7 @@ jobs: matrix: # build RL8, RL9 build: - image_name: rocky-latest-RL8 - source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2 inventory_groups: update - image_name: rocky-latest-RL9 source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index a504f3545..e2497d9c6 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -122,7 +122,6 @@ ansible.builtin.include_role: name: dnf_repos tasks_from: set_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided # --- tasks after here require access to package repos --- - hosts: squid diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml index d7dc4fd55..3e8022965 100644 --- a/ansible/disable-repos.yml +++ b/ansible/disable-repos.yml @@ -5,4 +5,3 @@ ansible.builtin.include_role: name: dnf_repos tasks_from: disable_repos.yml - when: ansible_distribution_major_version == "9" #TODO update role once RL8 config decided diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 4a0c9fd2a..eb740e084 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -4,18 +4,32 @@ dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" dnf_repos_username: "{{ omit }}" dnf_repos_password: "{{ omit }}" +dnf_repos_filenames: + '8': + baseos: 'Rocky-BaseOS' + appstream: 'Rocky-AppStream' + crb: 'Rocky-PowerTools' + extras: 'Rocky-Extras' + '9': + baseos: 'rocky' + appstream: 'rocky' + crb: 'rocky' + extras: 'rocky-extras' + +dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}" + # epel installed separately dnf_repos_repolist: -- file: rocky +- file: "{{ dnf_repos_version_filenames.baseos }}" name: baseos base_url: "{{ 
dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.baseos[ansible_distribution_version] }}" -- file: rocky +- file: "{{ dnf_repos_version_filenames.appstream }}" name: appstream base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.appstream[ansible_distribution_version] }}" -- file: rocky - name: crb +- file: "{{ dnf_repos_version_filenames.crb }}" + name: "{{ 'powertools' if ansible_distribution_major_version == '8' else 'crb' }}" base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" -- file: rocky-extras +- file: "{{ dnf_repos_version_filenames.extras }}" name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.extras[ansible_distribution_version] }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 1bac4590d..a9b7224d8 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -85,11 +85,16 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us appliances_repo_timestamps: baseos: '9.4': 20240816T002610 + '8.10': 20241217T123729 appstream: '9.4': 20240816T002610 + '8.10': 20241217T123729 crb: '9.4': 20240816T002610 + '8.10': 20241217T123729 extras: '9.4': 20240816T002610 + '8.10': 20241217T123729 epel: '9': 20240902T080424 + '8': 20241216T235733 From a9e53ba6a79857d09f1b8b95b5fe919089a9677d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 17 Dec 2024 14:21:40 +0000 Subject: [PATCH 108/182] Temporarily (?) 
building from rocky 8 genericcloud + update in fatimage --- .github/workflows/fatimage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 331035001..7d7571133 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -23,8 +23,8 @@ jobs: matrix: # build RL8, RL9 build: - image_name: openhpc-RL8 - source_image_name: rocky-latest-RL8 - inventory_groups: control,compute,login + source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2 + inventory_groups: control,compute,login,update - image_name: openhpc-RL9 source_image_name: rocky-latest-RL9 inventory_groups: control,compute,login From 47b7bb3691d42705a8db37156b3e935a82651b30 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:27:46 +0000 Subject: [PATCH 109/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 125180527..67e267dfb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241213-1416-9065bb6d", - "RL9": "openhpc-RL9-241213-1417-9065bb6d" + "RL8": "openhpc-RL8-241217-1341-eeb88386", + "RL9": "openhpc-RL9-241217-1341-eeb88386" } } From 7fe3ca5b2b6ec7f005012f919c799bbe11257eec Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:08:13 +0000 Subject: [PATCH 110/182] docs suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/experimental/pulp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index e0f32cdc1..6d30bec6b 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -1,11 +1,11 @@ # Pulp Server -In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `appliances_pulp_url` to point at the local Pulp's URL. +In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. ## Deploying/configuring Pulp Server ### Deploying a Pulp server -A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml' playbook. This can be run with +A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. This can be run with `ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` where `target_host` is any resolvable host. This will print a Pulp URL which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. 
Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. From 1faf4e523cf7db91e59327d978ab0aeffd05a41e Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:10:41 +0000 Subject: [PATCH 111/182] stopped openhpc overwriting epel 8 --- environments/common/inventory/group_vars/all/openhpc.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index a23bc77ba..cf2762f17 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -41,10 +41,4 @@ openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if applianc ohpc_default_extra_repos: "9": [] #overriding to ensure doesn't overwrite ark epel repo - "8": - - name: epel - file: epel - description: "Extra Packages for Enterprise Linux 8 - $basearch" - metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" - gpgcheck: true - gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" + "8": [] From 6ce4953d483ba8939bdd6c344b0ecc068179a258 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 18 Dec 2024 08:57:08 +0000 Subject: [PATCH 112/182] fixed broken powertools repo --- ansible/roles/dnf_repos/defaults/main.yml | 2 +- ansible/roles/dnf_repos/tasks/set_repos.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index eb740e084..89a8229f7 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -28,7 +28,7 @@ dnf_repos_repolist: base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.appstream[ansible_distribution_version] }}" - file: "{{ dnf_repos_version_filenames.crb }}" name: "{{ 'powertools' if ansible_distribution_major_version == '8' else 'crb' }}" - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/{{ 'PowerTools' if ansible_distribution_major_version == '8' else 'CRB' }}/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" - file: "{{ dnf_repos_version_filenames.extras }}" name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.extras[ansible_distribution_version] }}" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index fe5e2c02c..c9fcb0c07 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -8,6 +8,7 @@ description: "{{ item.name }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" + gpgcheck: false loop: "{{ dnf_repos_repolist }}" - name: Install epel-release From 29a157910b3c32414899d88b22dbb7446ce57bd0 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:57:33 +0000 Subject: [PATCH 113/182] bump --- 
.../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 989b9f9bb..5e5acebeb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241216-1146-18b220e1", - "RL9": "openhpc-RL9-241216-1146-18b220e1" + "RL8": "openhpc-RL8-241218-0900-a99d8be6", + "RL9": "openhpc-RL9-241218-0859-a99d8be6" } } From ee4ab93037e90806282c4da650a2fe816ac04b7a Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 18 Dec 2024 11:12:12 +0000 Subject: [PATCH 114/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 67e267dfb..db25176e2 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241217-1341-eeb88386", - "RL9": "openhpc-RL9-241217-1341-eeb88386" + "RL8": "openhpc-RL8-241218-1011-5effb3fa", + "RL9": "openhpc-RL9-241218-1011-5effb3fa" } } From 82ef12bd60e1e8ee39cfb2a852531d36965f1a5c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 12:01:04 +0000 Subject: [PATCH 115/182] support nfs for compute-init --- ansible/roles/compute_init/README.md | 18 +++++++++++------- .../roles/compute_init/files/compute-init.yml | 18 ++++++++++++++++-- ansible/roles/compute_init/tasks/install.yml | 5 ++++- .../common/inventory/group_vars/all/nfs.yml | 2 +- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index dac59e2d3..2931986ba 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -32,9 +32,12 @@ to configure the node before the services on the control node are available (which requires running the site.yml playbook). The following roles are currently fully functional: -- `resolv_conf` -- `etc_hosts` -- `stackhpc.openhpc` +- `resolv_conf`: all functionality +- `etc_hosts`: all functionality +- `nfs`: client functionality only +- `stackhpc.openhpc`: all functionality, except that the control server name + must be the control node's `inventory_hostname`; `openhpc_slurm_control_host` + and `openhpc_slurm_control_host_address` are ignored. # Development/debugging @@ -113,7 +116,8 @@ as in step 3. More generally, there is nothing to stop any group var depending on a "{{ hostvars[] }}" interpolation ... - Currently, the only functionality this has been problematic for is setting - the control node address for the slurmd node, which has been done using - the (metadata-provided) IP, given this is needed to do the NFS mount anyway - in the absence of working internal DNS. + Currently, this has been worked around for the following cases: + - The inventory hostname for the control node, indirected via `.api_address` + in the above hostvars. This is needed for the default nfs configuration + and the slurmctld namne. 
For compute-init this has been Defined using + "{{ groups['control'] | first }}" as the hostvars do include the groups. diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 74face5e1..66cf755d4 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -13,7 +13,17 @@ # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty resolv_conf_nameservers: [] - + + nfs_client_mnt_point: "/mnt" + nfs_client_mnt_options: + nfs_client_mnt_state: mounted + nfs_configurations: + nfs_enable: + clients: false + nfs_enable: + server: false + clients: false + tasks: - block: - name: Report skipping initialization if not compute node @@ -95,6 +105,10 @@ when: enable_etc_hosts # TODO: - name: NFS client mount + - name: If nfs-clients is present + include_tasks: nfs-clients.yml + when: nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) + loop: "{{ nfs_configurations }}" # TODO: - name: Manila mount @@ -116,7 +130,7 @@ - name: Set slurmctld location for configless operation lineinfile: path: /etc/sysconfig/slurmd - line: "SLURMD_OPTIONS='--conf-server {{ server_node_ip }}'" + line: "SLURMD_OPTIONS='--conf-server {{ groups['control'] | first }}'" regexp: "^SLURMD_OPTIONS=" create: yes owner: root diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index 4eef5deb8..fc96d3a18 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -12,6 +12,7 @@ - files - library - filter_plugins + - playbooks - name: Inject files from roles copy: @@ -33,7 +34,9 @@ dest: library/os_manila_share.py - src: ../../basic_users/filter_plugins/filter_keys.py dest: filter_plugins/filter_keys.py - + - src: ../../stackhpc.nfs/tasks/nfs-clients.yml + dest: playbooks/nfs-clients.yml + - name: Add filter_plugins to ansible.cfg lineinfile: path: /etc/ansible-init/ansible.cfg diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index e9366da2b..7960809ca 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -3,7 +3,7 @@ # See: https://github.com/stackhpc/ansible-role-cluster-nfs # for variable definitions -nfs_server_default: "{{ hostvars[groups['control'] | first ].internal_address }}" +nfs_server_default: "{{ groups['control'] | first }}" # avoid using hostvars so nfs_configurations: - comment: Export /exports/home from Slurm control node as /home From 9049d30dc6cfdf47e378e667303c0ac5e0129f4e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 12:01:26 +0000 Subject: [PATCH 116/182] fix compute-init README typos --- ansible/roles/compute_init/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 2931986ba..733cdb80f 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -50,9 +50,9 @@ To develop/debug this without actually having to build an image: 2. Reimage the compute nodes: - ansible-playbook --limit compute ansible/adhoc/rebuild + ansible-playbook --limit compute ansible/adhoc/rebuild.yml -3. 
Add metadata to a compute node e.g. via Horzon to turn on compute-init +3. Add metadata to a compute node e.g. via Horizon to turn on compute-init playbook functionality. 4. Fake an image build to deploy the compute-init playbook: From 79f52f9bd91d1761ef5a3e46e5130c9564de9f17 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 12:01:52 +0000 Subject: [PATCH 117/182] fix typo in resolv_conf metadata --- ansible/roles/compute_init/files/compute-init.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 66cf755d4..1e902e073 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -7,7 +7,7 @@ os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" server_node_ip: "{{ os_metadata.meta.k3s_server }}" enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" - enable_resolv_conf: "{{ os_metadata.meta.enable_senable_resolv_conf | default(false) | bool }}" + enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects From 54381501031495f894f57396ebecdd94a3bf4a0d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 18 Dec 2024 12:45:11 +0000 Subject: [PATCH 118/182] added 9.5 ark snapshots + bumped genericcloud --- .github/workflows/fatimage.yml | 2 +- ansible/adhoc/sync-pulp.yml | 2 +- docs/experimental/pulp.md | 2 +- environments/common/inventory/group_vars/all/defaults.yml | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 6649a3533..fb6395e45 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -26,7 +26,7 @@ jobs: source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 inventory_groups: control,compute,login,update - image_name: openhpc-RL9 - source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2 inventory_groups: control,compute,login,update env: ANSIBLE_FORCE_COLOR: True diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml index f26149bba..b2cd9a8c4 100644 --- a/ansible/adhoc/sync-pulp.yml +++ b/ansible/adhoc/sync-pulp.yml @@ -6,5 +6,5 @@ vars: pulp_site_target_arch: "x86_64" pulp_site_target_distribution: "rocky" - pulp_site_target_distribution_version: "9.4" + pulp_site_target_distribution_version: "9.5" pulp_site_target_distribution_version_major: "9" diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index 6d30bec6b..fb2cda023 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -14,4 +14,4 @@ An existing Pulp server can be used to host Ark repos by overriding `pulp_site_p ## Syncing Pulp content with Ark -If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. 
Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.4 with x86_64 architecture, but can be overridden by setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
+If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.5 with x86_64 architecture, but can be overridden by setting extravars for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 29724fb6f..3ff5ba02a 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -85,11 +85,15 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us
 appliances_repo_timestamps:
   baseos:
     '9.4': 20241115T011711
+    '9.5': 20241216T013503
   appstream:
     '9.4': 20241112T003151
+    '9.5': 20241217T005008
   crb:
     '9.4': 20241115T003133
+    '9.5': 20241217T005008
   extras:
     '9.4': 20241118T002802
+    '9.5': 20241218T004632
   epel:
     '9': 20241213T010218

From e7c96ad67c89c3a05d7eef0b8521998e9cb279c7 Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Wed, 18 Dec 2024 13:36:39 +0000
Subject: [PATCH 119/182] bump

---
 .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 67e267dfb..74bf0295c 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241217-1341-eeb88386",
-        "RL9": "openhpc-RL9-241217-1341-eeb88386"
+        "RL8": "openhpc-RL8-241218-1254-54381501",
+        "RL9": "openhpc-RL9-241218-1254-54381501"
     }
 }

From 4f81b89e5ca6509bca0ab8538db35655abaa0226 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 18 Dec 2024 13:51:55 +0000
Subject: [PATCH 120/182] fix nfs and make openhpc fully-capable in compute-init

---
 ansible/roles/compute_init/README.md          | 22 ++++++++++++-------
 .../roles/compute_init/files/compute-init.yml | 16 ++++++++++------
 ansible/roles/compute_init/tasks/install.yml  |  6 ++---
 .../common/inventory/group_vars/all/nfs.yml   |  2 +-
 .../inventory/group_vars/all/openhpc.yml      |  2 +-
 5 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md
index 733cdb80f..83de9d73f 100644
--- a/ansible/roles/compute_init/README.md
+++ b/ansible/roles/compute_init/README.md
@@ -35,9 +35,7 @@ The following roles are currently fully functional:
 - `resolv_conf`: all functionality
 - `etc_hosts`: all functionality
 - `nfs`: client functionality only
-- `stackhpc.openhpc`: all functionality, 
except that the control server name
-  must be the control node's `inventory_hostname`; `openhpc_slurm_control_host`
-  and `openhpc_slurm_control_host_address` are ignored.
+- `stackhpc.openhpc`: all functionality

 # Development/debugging

@@ -96,8 +94,8 @@ as in step 3.
   support certain subsets of role functionality or variables
   Examples: resolv_conf, stackhpc.openhpc

-- Some hostvars are tempalted from hostvars from other nodes, which aren't
-  available in the current approach:
+- Some variables are defined using hostvars from other nodes, which aren't
+  available in the current approach:

   ```
   [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
@@ -116,8 +114,12 @@ as in step 3.
   More generally, there is nothing to stop any group var depending on a
   "{{ hostvars[] }}" interpolation ...

-  Currently, this has been worked around for the following cases:
-  - The inventory hostname for the control node, indirected via `.api_address`
-    in the above hostvars. This is needed for the default nfs configuration
-    and the slurmctld namne. For compute-init this has been Defined using
-    "{{ groups['control'] | first }}" as the hostvars do include the groups.
+  Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern
+  for compute nodes - both of these indirect via `api_address` to
+  `inventory_hostname`. This has been worked around by replacing this with
+  "{{ groups['control'] | first }}" which does result in the control node
+  inventory hostname when templating.
+
+  Note that although `groups` is defined in the templated hostvars, when
+  the hostvars are loaded using `include_vars:` it is ignored as it is a
+  "magic variable" determined by ansible itself and cannot be set.
diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 1e902e073..ce10a890f 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -9,9 +9,9 @@
     enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}"
     enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}"
     enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}"
+    enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}"

     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
-    # this is a good example: common environment actually defines this (non-functional w/o compute groups), but role default is empty
     resolv_conf_nameservers: []

     nfs_client_mnt_point: "/mnt"
@@ -20,9 +20,8 @@
     nfs_configurations:
       nfs_enable:
         clients: false
-      nfs_enable:
-        server: false
-        clients: false
+
+    # openhpc: no defaults required

   tasks:
     - block:
@@ -106,8 +105,10 @@

       # TODO: - name: NFS client mount
       - name: If nfs-clients is present
-        include_tasks: nfs-clients.yml
-        when: nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool)
+        include_tasks: ../tasks/nfs-clients.yml
+        when:
+          - enable_nfs
+          - nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool)
         loop: "{{ nfs_configurations }}"

       # TODO: - name: Manila mount
@@ -130,7 +131,7 @@
       - name: Set slurmctld location for configless operation
         lineinfile:
           path: /etc/sysconfig/slurmd
-          line: "SLURMD_OPTIONS='--conf-server {{ groups['control'] | first }}'"
+          line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'"
          regexp: 
"^SLURMD_OPTIONS=" create: yes owner: root @@ -152,3 +153,4 @@ - name: Ensure node is resumed # TODO: consider if this is always safe for all job states? command: scontrol update state=resume nodename={{ ansible_hostname }} + # TODO: make safe for repeated runs diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index fc96d3a18..8f36aa836 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -12,7 +12,7 @@ - files - library - filter_plugins - - playbooks + - tasks - name: Inject files from roles copy: @@ -35,7 +35,7 @@ - src: ../../basic_users/filter_plugins/filter_keys.py dest: filter_plugins/filter_keys.py - src: ../../stackhpc.nfs/tasks/nfs-clients.yml - dest: playbooks/nfs-clients.yml + dest: tasks/nfs-clients.yml - name: Add filter_plugins to ansible.cfg lineinfile: @@ -52,4 +52,4 @@ dest: /etc/ansible-init/playbooks/1-compute-init.yml owner: root group: root - mode: 0644 \ No newline at end of file + mode: 0644 diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 7960809ca..45b7c6967 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -3,7 +3,7 @@ # See: https://github.com/stackhpc/ansible-role-cluster-nfs # for variable definitions -nfs_server_default: "{{ groups['control'] | first }}" # avoid using hostvars so +nfs_server_default: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init nfs_configurations: - comment: Export /exports/home from Slurm control node as /home diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index a23bc77ba..84fe6ef57 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -13,7 +13,7 @@ openhpc_slurm_accounting_storage_type: 'accounting_storage/slurmdbd' openhpc_slurmdbd_mysql_database: slurm_acct_db openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}" openhpc_slurmdbd_mysql_username: slurm -openhpc_slurm_control_host: "{{ hostvars[groups['control'].0].api_address }}" +openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" openhpc_slurm_partitions: - name: "compute" From 9859d542d5278ff595a9f53a8ae84910ff465937 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 14:02:30 +0000 Subject: [PATCH 121/182] make compute-init safe for rerunning ansible-init --- ansible/roles/compute_init/files/compute-init.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index ce10a890f..fbad53012 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -153,4 +153,7 @@ - name: Ensure node is resumed # TODO: consider if this is always safe for all job states? 
command: scontrol update state=resume nodename={{ ansible_hostname }} - # TODO: make safe for repeated runs + register: _scontrol_update + failed_when: + - _scontrol_update.rc > 0 + - "'slurm_update error: Invalid node state specified' not in _scontrol_update.stderr" From e0d0c06b126c56848cc1da7a93c9aa8a63463efd Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 14:40:10 +0000 Subject: [PATCH 122/182] support manila in compute-init --- .../roles/compute_init/files/compute-init.yml | 87 ++++++++++++++++++- ansible/roles/compute_init/tasks/export.yml | 11 +++ ansible/roles/compute_init/tasks/install.yml | 10 +-- .../inventory/group_vars/all/manila.yml | 5 ++ 4 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 environments/.stackhpc/inventory/group_vars/all/manila.yml diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index fbad53012..fb853407b 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -10,6 +10,7 @@ enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" + enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] @@ -23,6 +24,16 @@ # openhpc: no defaults required + os_manila_mount_shares: [] + os_manila_mount_ceph_conf_path: /etc/ceph + os_manila_mount_state: mounted + os_manila_mount_opts: + - x-systemd.device-timeout=30 + - x-systemd.mount-timeout=30 + - noatime + - _netdev # prevents mount blocking early boot before networking available + - rw + tasks: - block: - name: Report skipping initialization if not compute node @@ -103,15 +114,85 @@ mode: 0644 when: enable_etc_hosts - # TODO: - name: NFS client mount + # NFS client mount - name: If nfs-clients is present - include_tasks: ../tasks/nfs-clients.yml + include_tasks: tasks/nfs-clients.yml when: - enable_nfs - nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) loop: "{{ nfs_configurations }}" - # TODO: - name: Manila mount + - name: Manila mounts + block: + - name: Read manila share info from nfs file + include_vars: + file: /mnt/cluster/manila_share_info.yml + no_log: true # contains secrets + + - name: Ensure Ceph configuration directory exists + ansible.builtin.file: + path: "{{ os_manila_mount_ceph_conf_path }}" + state: directory + mode: "0755" + owner: root + group: root + + - name: Configure ceph.conf using os_manila_mount_host + ansible.builtin.template: + src: ceph.conf.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf" + owner: root + group: root + mode: "0600" + + - name: Ensure mount directory exists + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Write Ceph client keyring + ansible.builtin.template: + src: ceph.keyring.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring" + mode: "0600" + owner: root + group: root + loop: "{{ os_manila_mount_share_info }}" + 
loop_control:
+            label: "{{ item.share_name }}"
+
+        - name: Mount the Ceph share
+          ansible.posix.mount:
+            path: "{{ item[0].mount_path }}"
+            src: "{{ item[1].host }}:{{ item[1].export }}"
+            fstype: ceph
+            opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}"
+            # NB share_user is looked up here in case of autodetection
+            state: "{{ item[0].mount_state | default(os_manila_mount_state) }}"
+          loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}"
+          loop_control:
+            label: "{{ item[0].share_name }}"
+
+        - name: Ensure mounted directory has correct permissions
+          ansible.builtin.file:
+            path: "{{ item.mount_path }}"
+            state: directory
+            owner: "{{ item.mount_user | default(omit) }}"
+            group: "{{ item.mount_group | default(omit) }}"
+            mode: "{{ item.mount_mode | default(omit) }}"
+          loop: "{{ os_manila_mount_shares }}"
+          loop_control:
+            label: "{{ item.share_name }}"
+          when: item.mount_state | default(os_manila_mount_state) in ['mounted', 'ephemeral']
+      when:
+        - enable_manila
+        - os_manila_mount_shares | length > 0

       # TODO: - name: Basic users setup

diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml
index 3e9340cb5..d1d609895 100644
--- a/ansible/roles/compute_init/tasks/export.yml
+++ b/ansible/roles/compute_init/tasks/export.yml
@@ -33,3 +33,14 @@
         dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml
         mode: u=rw,go=
   delegate_to: "{{ groups['control'] | first }}"
+
+- name: Copy manila share info to /exports/cluster
+  copy:
+    content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}"
+    dest: /exports/cluster/manila_share_info.yml
+  run_once: true
+  delegate_to: "{{ groups['control'] | first }}"
+  when: os_manila_mount_share_info is defined
+  vars:
+    os_manila_mount_share_info_var:
+      os_manila_mount_share_info: "{{ os_manila_mount_share_info }}"
diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml
index 8f36aa836..29a2f53e7 100644
--- a/ansible/roles/compute_init/tasks/install.yml
+++ b/ansible/roles/compute_init/tasks/install.yml
@@ -2,7 +2,7 @@

 - name: Ensure directories exist
   file:
-    path: "/etc/ansible-init/{{ item }}"
+    path: "/etc/ansible-init/playbooks/{{ item }}"
     state: directory
     owner: root
     group: root
@@ -17,7 +17,7 @@
 - name: Inject files from roles
   copy:
     src: '{{ item.src }}'
-    dest: '/etc/ansible-init/{{ item.dest }}'
+    dest: '/etc/ansible-init/playbooks/{{ item.dest }}'
     owner: root
     group: root
     mode: 0644
@@ -30,10 +30,8 @@
       dest: templates/ceph.keyring.j2
     - src: ../../resolv_conf/files/NetworkManager-dns-none.conf
       dest: files/NetworkManager-dns-none.conf
-    - src: ../../stackhpc.os-manila-mount/library/os_manila_share.py
-      dest: library/os_manila_share.py
-    - src: ../../basic_users/filter_plugins/filter_keys.py
-      dest: filter_plugins/filter_keys.py
+    # - src: ../../basic_users/filter_plugins/filter_keys.py
+    #   dest: filter_plugins/filter_keys.py
     - src: ../../stackhpc.nfs/tasks/nfs-clients.yml
       dest: tasks/nfs-clients.yml

diff --git a/environments/.stackhpc/inventory/group_vars/all/manila.yml b/environments/.stackhpc/inventory/group_vars/all/manila.yml
new file mode 100644
index 000000000..767a5dde8
--- /dev/null
+++ b/environments/.stackhpc/inventory/group_vars/all/manila.yml
@@ -0,0 +1,5 @@
+os_manila_mount_shares_arcus:
+  - share_name: slurm-v2-home
+    mount_path: /project
+
+os_manila_mount_shares: "{{ os_manila_mount_shares_arcus if ci_cloud == 'ARCUS' else [] }}"

From 
68bec3eaeb931ebb32312adee4c7b025f712aca5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 15:35:50 +0000 Subject: [PATCH 123/182] test manila if running on arcus --- environments/.stackhpc/inventory/group_vars/all/manila.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/.stackhpc/inventory/group_vars/all/manila.yml b/environments/.stackhpc/inventory/group_vars/all/manila.yml index 767a5dde8..59f935873 100644 --- a/environments/.stackhpc/inventory/group_vars/all/manila.yml +++ b/environments/.stackhpc/inventory/group_vars/all/manila.yml @@ -1,5 +1,7 @@ os_manila_mount_shares_arcus: - share_name: slurm-v2-home mount_path: /project + - share_name: slurm-scratch + mount_path: /scratch os_manila_mount_shares: "{{ os_manila_mount_shares_arcus if ci_cloud == 'ARCUS' else [] }}" From 14e7dc66d5d5254e495aee476b849d57ae187f8f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 15:52:51 +0000 Subject: [PATCH 124/182] support basic_users in compute-init --- ansible/roles/compute_init/README.md | 7 ++-- .../roles/compute_init/files/compute-init.yml | 34 ++++++++++++++++++- ansible/roles/compute_init/tasks/install.yml | 4 +-- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 83de9d73f..94d9cd51c 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -31,11 +31,14 @@ The check in 4b. above is what prevents the compute-init script from trying to configure the node before the services on the control node are available (which requires running the site.yml playbook). -The following roles are currently fully functional: +The following roles/groups are currently fully functional: - `resolv_conf`: all functionality - `etc_hosts`: all functionality - `nfs`: client functionality only -- `stackhpc.openhpc`: all functionality +- `manila`: all functionality +- `openhpc`: all functionality +- `basic_users`: all functionality, assumes home directory already exists on + shared storage # Development/debugging diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index fb853407b..1c37bbdc9 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -11,6 +11,7 @@ enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" + enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] @@ -34,6 +35,15 @@ - _netdev # prevents mount blocking early boot before networking available - rw + basic_users_groups: [] + basic_users_manage_homedir: false # homedir must already exist on shared filesystem + basic_users_userdefaults: + state: present + create_home: "{{ basic_users_manage_homedir }}" + generate_ssh_key: "{{ basic_users_manage_homedir }}" + ssh_key_comment: "{{ item.name }}" + basic_users_users: [] + tasks: - block: - name: Report skipping initialization if not compute node @@ -194,7 +204,29 @@ - enable_manila - os_manila_mount_shares | length > 0 - # TODO: - name: Basic users setup + - name: Basic users + block: + - name: Create groups + ansible.builtin.group: "{{ item }}" + loop: "{{ 
basic_users_groups }}" + + - name: Create users + user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}" + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }} [{{ item.state | default('present') }}]" + register: basic_users_info + + - name: Write sudo rules + blockinfile: + path: /etc/sudoers.d/80-{{ item.name}}-user + block: "{{ item.sudo }}" + create: true + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }}" + when: "'sudo' in item" + when: enable_basic_users # TODO: - name: Configure EESSI diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index 29a2f53e7..bbcbf133f 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -30,8 +30,8 @@ dest: templates/ceph.keyring.j2 - src: ../../resolv_conf/files/NetworkManager-dns-none.conf dest: files/NetworkManager-dns-none.conf - # - src: ../../basic_users/filter_plugins/filter_keys.py - # dest: filter_plugins/filter_keys.py + - src: ../../basic_users/filter_plugins/filter_keys.py + dest: filter_plugins/filter_keys.py - src: ../../stackhpc.nfs/tasks/nfs-clients.yml dest: tasks/nfs-clients.yml From a2418ef42998e41a415388de667b42f57f963755 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 16:09:52 +0000 Subject: [PATCH 125/182] support eessi in compute-init --- ansible/roles/compute_init/README.md | 2 ++ .../roles/compute_init/files/compute-init.yml | 17 +++++++++++++-- ansible/roles/compute_init/tasks/export.yml | 21 +++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 94d9cd51c..e9a045342 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -39,6 +39,8 @@ The following roles/groups are currently fully functional: - `openhpc`: all functionality - `basic_users`: all functionality, assumes home directory already exists on shared storage +- `eessi`: all functionality, assumes `cvmfs_config` is the same on control + node and all compute nodes. 
# Development/debugging

diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 1c37bbdc9..6327151bb 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -12,6 +12,7 @@
     enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}"
     enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}"
     enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}"
+    enable_eessi: "{{ os_metadata.meta.enable_eessi | default(false) | bool }}"

     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
     resolv_conf_nameservers: []
@@ -228,9 +229,21 @@
           when: "'sudo' in item"
       when: enable_basic_users

-      # TODO: - name: Configure EESSI
+      - name: EESSI
+        block:
+          - name: Copy cvmfs config
+            copy:
+              src: /mnt/cluster/cvmfs/default.local
+              dest: /etc/cvmfs/default.local
+              owner: root
+              group: root
+              mode: 0644
+
+          - name: Ensure CVMFS config is setup
+            command:
+              cmd: "cvmfs_config setup"
+        when: enable_eessi

-      # TODO: - name: Configure openhpc
       # NB: don't need conditional block on enable_compute as have already exited
       # if not the case
       - name: Write Munge key
diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml
index d1d609895..12b648f6e 100644
--- a/ansible/roles/compute_init/tasks/export.yml
+++ b/ansible/roles/compute_init/tasks/export.yml
@@ -44,3 +44,24 @@
   vars:
     os_manila_mount_share_info_var:
       os_manila_mount_share_info: "{{ os_manila_mount_share_info }}"
+
+- name: Ensure /exports/cluster/cvmfs directory exists
+  file:
+    path: /exports/cluster/cvmfs
+    state: directory
+    owner: root
+    group: root
+    mode: 0755
+  run_once: true
+  delegate_to: "{{ groups['control'] | first }}"
+
+- name: Copy EESSI CVMFS config to /exports/cluster
+  copy:
+    src: /etc/cvmfs/default.local
+    dest: /exports/cluster/cvmfs/default.local
+    owner: root
+    group: root
+    mode: 0644
+    remote_src: true
+  run_once: true
+  delegate_to: "{{ groups['control'] | first }}"

From cdbf005750a979bc430f00243323a6af70eecda0 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 18 Dec 2024 16:19:28 +0000
Subject: [PATCH 126/182] change metadata from k3s_server to control_address

---
 ansible/roles/cluster_infra/templates/resources.tf.j2        | 4 ++--
 ansible/roles/compute_init/files/compute-init.yml            | 2 +-
 ansible/roles/k3s/files/start_k3s.yml                        | 2 +-
 .../{{cookiecutter.environment}}/terraform/compute.tf        | 2 +-
 .../{{cookiecutter.environment}}/terraform/compute/nodes.tf  | 2 +-
 .../terraform/compute/variables.tf                           | 4 ++--
 .../skeleton/{{cookiecutter.environment}}/terraform/nodes.tf | 2 +-
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2
index 453f01a7e..69d001105 100644
--- a/ansible/roles/cluster_infra/templates/resources.tf.j2
+++ b/ansible/roles/cluster_infra/templates/resources.tf.j2
@@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" {
     ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
     {% endif %}
     {% endfor %}
-    k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
+    control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
     k3s_token = "{{ k3s_token }}"
   }
 }
@@ -565,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
    ansible_init_coll_{{ loop.index0 }}_source = 
"{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 6327151bb..7c2ad6ae2 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -5,7 +5,7 @@ become: yes vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" - server_node_ip: "{{ os_metadata.meta.k3s_server }}" + server_node_ip: "{{ os_metadata.meta.control_address }}" enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 8ee0e6114..b9b82f1c4 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -3,7 +3,7 @@ vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" k3s_token: "{{ os_metadata.meta.k3s_token }}" - k3s_server_name: "{{ os_metadata.meta.k3s_server }}" + k3s_server_name: "{{ os_metadata.meta.control_address }}" service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" tasks: - name: Ensure password directory exists diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index eb2139eba..14c728a5a 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -16,6 +16,6 @@ module "compute" { key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index e64a2162c..7a2a706a6 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -47,7 +47,7 @@ resource "openstack_compute_instance_v2" "compute" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = var.k3s_server + control_address = var.control_address } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 9d2c2e47c..3655c9e65 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -72,7 +72,7 @@ variable "k3s_token" { type = string } -variable "k3s_server" { - description = 
"Name/address of k3s server" +variable "control_address" { + description = "Name/address of control node" type = string } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index bfbd1c532..8ea8cabcb 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -126,7 +126,7 @@ resource "openstack_compute_instance_v2" "login" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] } user_data = <<-EOF From 3b9eb467fd859585395d1f8450f4254dcffe75a3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 18 Dec 2024 16:28:23 +0000 Subject: [PATCH 127/182] fixup resolv_conf support in cloud-init --- ansible/roles/compute_init/README.md | 2 +- ansible/roles/compute_init/files/compute-init.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index e9a045342..77a127245 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -36,11 +36,11 @@ The following roles/groups are currently fully functional: - `etc_hosts`: all functionality - `nfs`: client functionality only - `manila`: all functionality -- `openhpc`: all functionality - `basic_users`: all functionality, assumes home directory already exists on shared storage - `eessi`: all functionality, assumes `cvmfs_config` is the same on control node and all compute nodes. 
+- `openhpc`: all functionality # Development/debugging diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 7c2ad6ae2..c7a9048b4 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -94,7 +94,7 @@ block: - name: Set nameservers in /etc/resolv.conf ansible.builtin.template: - src: /etc/ansible-init/templates/resolv.conf.j2 + src: resolv.conf.j2 dest: /etc/resolv.conf owner: root group: root @@ -102,7 +102,7 @@ - name: Disable NetworkManager control of resolv.conf ansible.builtin.copy: - src: /etc/ansible-init/files/NetworkManager-dns-none.conf + src: files/NetworkManager-dns-none.conf dest: /etc/NetworkManager/conf.d/90-dns-none.conf owner: root group: root From 15ed0a3880e046aacf589ab28dbfe4d5b532c7a0 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:52:45 +0000 Subject: [PATCH 128/182] Bump RL9.4 repo timestamps to latest snapshots (#497) * bumped repo timestamps to latest * bump * now performs update in fatimage * bump * bump --- .github/workflows/fatimage.yml | 8 ++++---- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- .../common/inventory/group_vars/all/defaults.yml | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 331035001..6649a3533 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -23,11 +23,11 @@ jobs: matrix: # build RL8, RL9 build: - image_name: openhpc-RL8 - source_image_name: rocky-latest-RL8 - inventory_groups: control,compute,login + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: control,compute,login,update - image_name: openhpc-RL9 - source_image_name: rocky-latest-RL9 - inventory_groups: control,compute,login + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: control,compute,login,update env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 7c59abf36..db25176e2 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241216-1607-2357a730", - "RL9": "openhpc-RL9-241216-1607-2357a730" + "RL8": "openhpc-RL8-241218-1011-5effb3fa", + "RL9": "openhpc-RL9-241218-1011-5effb3fa" } } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 1bac4590d..29724fb6f 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -84,12 +84,12 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us appliances_repo_timestamps: baseos: - '9.4': 20240816T002610 + '9.4': 20241115T011711 appstream: - '9.4': 20240816T002610 + '9.4': 20241112T003151 crb: - '9.4': 20240816T002610 + '9.4': 20241115T003133 extras: - '9.4': 20240816T002610 + '9.4': 20241118T002802 epel: - '9': 20240902T080424 + '9': 20241213T010218 From fed2d6eb7de7d35b9609e190007afdd3d41266da Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:09:21 +0000 Subject: 
[PATCH 129/182] Pin nvidia-driver and cuda packages to working packages (#496) * move cuda tasks to install * pin nvidia driver to working version and autodetect os/arch * make install of cuda packages optional * don't run cuda install tasks unless during build * move doca install before cuda * update cuda docs * add cuda to extra build test CI * add cuda runtime tasks * fix typo in extras playbook * bump extra build size to 30GB for cuda * pin both cuda package version * make cuda idempotent/restartable * allow using computed tasks_from for cuda role * fix showing image summary * rename nvidia driver version var * bump CI image --- .github/workflows/{doca.yml => extra.yml} | 21 +++++++++----- ansible/cleanup.yml | 3 +- ansible/extras.yml | 3 +- ansible/fatimage.yml | 12 ++++++-- ansible/roles/cuda/README.md | 8 ++--- ansible/roles/cuda/defaults/main.yml | 7 ++--- .../cuda/tasks/{main.yml => install.yml} | 29 ++++++++++++++----- ansible/roles/cuda/tasks/runtime.yml | 5 ++++ .../terraform/cluster_image.auto.tfvars.json | 4 +-- 9 files changed, 61 insertions(+), 31 deletions(-) rename .github/workflows/{doca.yml => extra.yml} (89%) rename ansible/roles/cuda/tasks/{main.yml => install.yml} (60%) create mode 100644 ansible/roles/cuda/tasks/runtime.yml diff --git a/.github/workflows/doca.yml b/.github/workflows/extra.yml similarity index 89% rename from .github/workflows/doca.yml rename to .github/workflows/extra.yml index cfd3bb982..dece242ce 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/extra.yml @@ -1,4 +1,4 @@ -name: Test DOCA extra build +name: Test extra build on: workflow_dispatch: push: @@ -7,16 +7,18 @@ on: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - - '.github/workflows/doca' + - 'ansible/roles/cuda/**' + - '.github/workflows/extra.yml' pull_request: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - - '.github/workflows/doca' + - 'ansible/roles/cuda/**' + - '.github/workflows/extra.yml' jobs: doca: - name: doca-build + name: extra-build concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true @@ -25,12 +27,14 @@ jobs: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 build: - - image_name: openhpc-doca-RL8 + - image_name: openhpc-extra-RL8 source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json - inventory_groups: doca - - image_name: openhpc-doca-RL9 + inventory_groups: doca,cuda + volume_size: 30 # needed for cuda + - image_name: openhpc-extra-RL9 source_image_name_key: RL9 - inventory_groups: doca + inventory_groups: doca,cuda + volume_size: 30 # needed for cuda env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -95,6 +99,7 @@ jobs: -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ -var "image_name=${{ matrix.build.image_name }}" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ + -var "volume_size=${{ matrix.build.volume_size }}" \ openstack.pkr.hcl - name: Get created image names from manifest diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 3f059d157..670a99b29 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -66,5 +66,4 @@ slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" - name: Show image summary - debug: - var: image_info + command: cat 
/var/lib/image/image.json diff --git a/ansible/extras.yml b/ansible/extras.yml index 107f85252..0a74541a5 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -24,8 +24,9 @@ gather_facts: yes tags: cuda tasks: - - import_role: + - include_role: name: cuda + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}" - name: Persist hostkeys across rebuilds # Must be after filesystems.yml (for storage) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 55e56e612..c35be5b64 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -29,6 +29,14 @@ - import_playbook: bootstrap.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - name: Run post-bootstrap.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -220,8 +228,6 @@ import_role: name: doca -- import_playbook: disable-repos.yml - - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -229,6 +235,8 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: disable-repos.yml + - hosts: builder become: yes gather_facts: yes diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md index 141e7b80d..be6439cd5 100644 --- a/ansible/roles/cuda/README.md +++ b/ansible/roles/cuda/README.md @@ -1,6 +1,6 @@ # cuda -Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. +Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. ## Prerequisites @@ -8,8 +8,8 @@ Requires OFED to be installed to provide required kernel-* packages. ## Role Variables -- `cuda_distro`: Optional. Default `rhel8`. -- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo` -- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed. +- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. +- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version. - `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`. +- `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. 
Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 33a25d9b4..05f1e093d 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,7 +1,6 @@ -cuda_distro: "rhel{{ ansible_distribution_major_version }}" -cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo" -cuda_driver_stream: default -cuda_package_version: 'latest' +cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" +cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages +cuda_package_version: '12.6.3-1' cuda_packages: - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/install.yml similarity index 60% rename from ansible/roles/cuda/tasks/main.yml rename to ansible/roles/cuda/tasks/install.yml index 22f8e9e8e..51c92a0d3 100644 --- a/ansible/roles/cuda/tasks/main.yml +++ b/ansible/roles/cuda/tasks/install.yml @@ -1,7 +1,7 @@ # Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation -- name: Check for OFED +- name: Check for OFED/DOCA command: cmd: dnf list --installed rdma-core register: _dnf_rdma_core @@ -10,41 +10,53 @@ - name: Assert OFED installed assert: that: "'mlnx' in _dnf_rdma_core.stdout" - fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?" + fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?" - name: Install cuda repo get_url: - dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo" - url: "{{ cuda_repo }}" + dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo" + url: "{{ cuda_repo_url }}" - name: Check if nvidia driver module is enabled - shell: - cmd: dnf module list --enabled nvidia-driver + ansible.builtin.command: dnf module list --enabled nvidia-driver changed_when: false failed_when: false register: _cuda_driver_module_enabled - name: Enable nvidia driver module - ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" + ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}" register: _cuda_driver_module_enable when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" +- name: Check if nvidia driver module is installed + ansible.builtin.command: dnf module list --installed nvidia-driver + changed_when: false + failed_when: false + register: _cuda_driver_module_installed + - name: Install nvidia drivers ansible.builtin.command: dnf module install -y nvidia-driver register: _cuda_driver_install - when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" + when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr" changed_when: "'Nothing to do' not in _cuda_driver_install.stdout" +- name: Check kernel has not been modified + assert: + that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. 
kernel-devel-matched + fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" + - name: Install cuda packages ansible.builtin.dnf: name: "{{ cuda_packages }}" + when: cuda_package_version != 'none' register: cuda_package_install - name: Add cuda binaries to path lineinfile: path: /etc/profile.d/sh.local line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin' + when: cuda_package_version != 'none' - name: Enable NVIDIA Persistence Daemon systemd: @@ -60,3 +72,4 @@ - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 + when: cuda_package_install.changed diff --git a/ansible/roles/cuda/tasks/runtime.yml b/ansible/roles/cuda/tasks/runtime.yml new file mode 100644 index 000000000..c16a48c6f --- /dev/null +++ b/ansible/roles/cuda/tasks/runtime.yml @@ -0,0 +1,5 @@ +- name: Ensure NVIDIA Persistence Daemon state + systemd: + name: nvidia-persistenced + enabled: true + state: "{{ cuda_persistenced_state }}" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index db25176e2..be2f156a3 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241218-1011-5effb3fa", - "RL9": "openhpc-RL9-241218-1011-5effb3fa" + "RL8": "openhpc-RL8-241218-1705-09ac4268", + "RL9": "openhpc-RL9-241218-1705-09ac4268" } } From 722a0c1a0d41a7c8ebdd350dfe24d739526e2665 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Dec 2024 11:04:19 +0000 Subject: [PATCH 130/182] moved pulp subpaths into common structure --- ansible/filter_plugins/utils.py | 4 ++++ ansible/roles/dnf_repos/defaults/main.yml | 12 ++++------ ansible/roles/dnf_repos/tasks/set_repos.yml | 4 ++-- ansible/roles/pulp_site/defaults/main.yml | 23 ++++++++----------- .../inventory/group_vars/all/defaults.yml | 22 +++++++++++++----- 5 files changed, 37 insertions(+), 28 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index f69d6f3f7..15ba14777 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -48,6 +48,9 @@ def to_ood_regex(items): r = ['(%s)' % v for v in r] return '|'.join(r) +def appliances_repo_to_subpath(repo_entry): + return repo_entry.path+'/'+repo_entry.timestamp + class FilterModule(object): ''' Ansible core jinja2 filters ''' @@ -63,4 +66,5 @@ def filters(self): 'exists': exists, 'warn': self.warn, 'to_ood_regex': to_ood_regex, + 'appliance_repo_to_subpath': appliances_repo_to_subpath } diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 4a0c9fd2a..afc773b2f 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,6 +1,4 @@ dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content" -dnf_repos_rocky_prefix: "{{ ansible_distribution | lower }}/{{ ansible_distribution_version }}" -dnf_repos_epel_prefix: "epel/{{ ansible_distribution_major_version }}" dnf_repos_username: "{{ omit }}" dnf_repos_password: "{{ omit }}" @@ -8,16 +6,16 @@ dnf_repos_password: "{{ omit }}" dnf_repos_repolist: - file: rocky name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/BaseOS/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.baseos[ansible_distribution_version] }}" + subpath: "{{ 
appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/AppStream/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.appstream[ansible_distribution_version] }}" + subpath: "{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky name: crb - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/CRB/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.crb[ansible_distribution_version] }}" + subpath: "{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky-extras name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_rocky_prefix }}/extras/{{ ansible_architecture }}/os/{{ appliances_repo_timestamps.extras[ansible_distribution_version] }}" + subpath: "{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/epel/{{ ansible_distribution_major_version }}/Everything/{{ ansible_architecture }}/{{ appliances_repo_timestamps.epel[ansible_distribution_major_version] }}" +dnf_repos_epel_subpath: "{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index fe5e2c02c..aab5b85e4 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -4,7 +4,7 @@ ansible.builtin.yum_repository: file: "{{ item.file }}" name: "{{ item.name }}" - baseurl: "{{ item.base_url }}" + baseurl: "{{ dnf_repos_pulp_content_url }}/{{ item.subpath }}" description: "{{ item.name }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" @@ -21,6 +21,6 @@ file: epel description: "{{ dnf_repos_epel_description }}" gpgcheck: false - baseurl: "{{ dnf_repos_epel_baseurl }}" + baseurl: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_epel_subpath }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index d343d4998..fc23a4489 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -3,28 +3,25 @@ pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed pulp_site_password: "{{ vault_pulp_admin_password }}" pulp_site_upstream_content_url: https://ark.stackhpc.com/pulp/content -_pulp_site_rocky_prefix: "{{ pulp_site_target_distribution }}/{{ pulp_site_target_distribution_version }}" pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" pulp_site_target_facts: "{{ hostvars[groups['builder'][0]]['ansible_facts'] }}" -pulp_site_target_arch: "{{ pulp_site_target_facts['architecture'] }}" -pulp_site_target_distribution: "{{ pulp_site_target_facts['distribution'] | lower }}" pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" pulp_site_rpm_info: -- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ 
appliances_repo_timestamps.baseos[pulp_site_target_distribution_version] }}" - subpath: "{{ _pulp_site_rocky_prefix }}/BaseOS/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.baseos[pulp_site_target_distribution_version] }}" -- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.appstream[pulp_site_target_distribution_version] }}" - subpath: "{{ _pulp_site_rocky_prefix }}/AppStream/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.appstream[pulp_site_target_distribution_version] }}" -- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.crb[pulp_site_target_distribution_version] }}" - subpath: "{{ _pulp_site_rocky_prefix }}/{{ 'PowerTools' if pulp_site_target_distribution_version_major == '8' else 'CRB' }}/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.crb[pulp_site_target_distribution_version] }}" -- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_repo_timestamps.extras[pulp_site_target_distribution_version] }}" - subpath: "{{ _pulp_site_rocky_prefix }}/extras/{{ pulp_site_default_upstream_suffix }}/{{ appliances_repo_timestamps.extras[pulp_site_target_distribution_version] }}" -- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_repo_timestamps.epel[pulp_site_target_distribution_version_major] }}" - subpath: "epel/{{ pulp_site_target_distribution_version_major }}/Everything/{{ pulp_site_target_arch }}/{{ appliances_repo_timestamps.epel[pulp_site_target_distribution_version_major] }}" +- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" +- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 29724fb6f..96db68667 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,14 +82,24 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ########################################################################################### -appliances_repo_timestamps: +appliances_pulp_repositories: baseos: - 
'9.4': 20241115T011711 + '9.4': + timestamp: 20241115T011711 + path: rocky/9.4/BaseOS/x86_64/os appstream: - '9.4': 20241112T003151 + '9.4': + timestamp: 20241112T003151 + path: rocky/9.4/AppStream/x86_64/os crb: - '9.4': 20241115T003133 + '9.4': + timestamp: 20241115T003133 + path: rocky/9.4/CRB/x86_64/os extras: - '9.4': 20241118T002802 + '9.4': + timestamp: 20241118T002802 + path: rocky/9.4/extras/x86_64/os epel: - '9': 20241213T010218 + '9': + timestamp: 20241213T010218 + path: epel/9/Everything/x86_64 \ No newline at end of file From d1f3c69e8defaa25908ddd657e8b4ffb3ef3639d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Dec 2024 11:30:09 +0000 Subject: [PATCH 131/182] typos --- ansible/filter_plugins/utils.py | 4 ++-- environments/common/inventory/group_vars/all/defaults.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 15ba14777..9559d0fee 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -49,7 +49,7 @@ def to_ood_regex(items): return '|'.join(r) def appliances_repo_to_subpath(repo_entry): - return repo_entry.path+'/'+repo_entry.timestamp + return repo_entry['path']+'/'+repo_entry['timestamp'] class FilterModule(object): ''' Ansible core jinja2 filters ''' @@ -66,5 +66,5 @@ def filters(self): 'exists': exists, 'warn': self.warn, 'to_ood_regex': to_ood_regex, - 'appliance_repo_to_subpath': appliances_repo_to_subpath + 'appliances_repo_to_subpath': appliances_repo_to_subpath } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 96db68667..7a7d5c7c0 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -82,7 +82,7 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us ########################################################################################### -appliances_pulp_repositories: +appliances_pulp_repos: baseos: '9.4': timestamp: 20241115T011711 From 357f7e25e7ccdefb8748096d25cb8f9315c63ce9 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 19 Dec 2024 11:41:06 +0000 Subject: [PATCH 132/182] docs suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/operations.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/operations.md b/docs/operations.md index 50eef9053..edf8881f0 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -74,8 +74,10 @@ By default, the following utility packages are installed during build: - postfix - git - latest python version for system (3.6 for for Rocky 8.9 and 3.12 for Rocky 9.4) -Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_other_extra_package` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example: +Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_other_extra_packages` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. 
For example:
+
+```yaml
 # environments/foo-base/inventory/group_vars/all/defaults.yml:
 appliances_other_extra_packages:
   - somepackage
 ```

The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules.

-If you wish to install packages during runtime, the `site.yml` playbook should be run `appliances_packages_during_configure` overriden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enabled DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users).
+If you wish to install packages during runtime, the `site.yml` playbook should be run with `appliances_packages_during_configure` overridden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enable DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users).

 If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this:
 - `ansible.builtin.yum_repository`: Add a repo from a URL providing a 'repodata' directory.
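For illustration, a minimal sketch of such a hook play using `ansible.builtin.yum_repository` follows. This is not part of the patch series above: the hook filename, repository name and `baseurl` are hypothetical, and the `hosts: cluster:builder` pattern is an assumption based on the note that the play should not exclude the builder group.

```yaml
# environments/$SITE_ENV/hooks/post.yml - hypothetical example
- hosts: cluster:builder  # include builder so the repo is also present in built images
  become: yes
  tasks:
    - name: Add an extra DNF repository
      ansible.builtin.yum_repository:
        name: example-extra                                         # hypothetical repo name
        description: Example extra repository
        baseurl: https://repo.example.com/el$releasever/$basearch/  # hypothetical URL
        gpgcheck: false
        enabled: true
```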
From 17499e7fa386825114049da04d86221da6a34aa5 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Dec 2024 12:14:32 +0000 Subject: [PATCH 133/182] dnf_repos urls fully overridable again --- ansible/roles/dnf_repos/defaults/main.yml | 10 +++++----- ansible/roles/dnf_repos/tasks/set_repos.yml | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index afc773b2f..bff89b4a9 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -6,16 +6,16 @@ dnf_repos_password: "{{ omit }}" dnf_repos_repolist: - file: rocky name: baseos - subpath: "{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky name: appstream - subpath: "{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky name: crb - subpath: "{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" - file: rocky-extras name: extras - subpath: "{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" -dnf_repos_epel_subpath: "{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" +dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index aab5b85e4..fe5e2c02c 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -4,7 +4,7 @@ ansible.builtin.yum_repository: file: "{{ item.file }}" name: "{{ item.name }}" - baseurl: "{{ dnf_repos_pulp_content_url }}/{{ item.subpath }}" + baseurl: "{{ item.base_url }}" description: "{{ item.name }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" @@ -21,6 +21,6 @@ file: epel description: "{{ dnf_repos_epel_description }}" gpgcheck: false - baseurl: "{{ dnf_repos_pulp_content_url }}/{{ dnf_repos_epel_subpath }}" + baseurl: "{{ dnf_repos_epel_baseurl }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" From 1e2e6d8a2c319d4d9c1e5b5d83f5bea64aed7b77 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:28:34 +0000 Subject: [PATCH 134/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index be9dfe5cb..4f21c6b99 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 
@@
 {
   "cluster_image": {
-    "RL8": "openhpc-RL8-241217-1146-d77be652",
-    "RL9": "openhpc-RL9-241217-1145-d77be652"
+    "RL8": "openhpc-RL8-241219-1232-7f84fed4",
+    "RL9": "openhpc-RL9-241219-1145-7f84fed4"
   }
 }

From 6a8ecda6ce5c2074ba5d37cc955947626a73b7b1 Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Thu, 19 Dec 2024 14:52:11 +0000
Subject: [PATCH 135/182] variable renames from review

Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com>
---
 environments/common/inventory/group_vars/all/defaults.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 9ce228493..417eb9eed 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -82,7 +82,7 @@ appliances_local_users: "{{ appliances_local_users_default + appliances_local_us
 
 ################## bootstrap: extra package installs ######################################
 
-appliances_default_extra_packages:
+appliances_extra_packages_default:
   - htop
   - nano
   - screen
@@ -95,11 +95,11 @@ appliances_default_extra_packages:
   - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}"
 
 
-appliances_packages_during_configure: false
+appliances_extra_packages_during_configure: false
 
-appliances_other_extra_packages: []
+appliances_extra_packages_other: []
 
-appliances_extra_packages: "{{ appliances_default_extra_packages + appliances_other_extra_packages }}"
+appliances_extra_packages: "{{ appliances_extra_packages_default + appliances_extra_packages_other }}"

From ef33eefbef836f1dcff8d3d91be9e164d7fc9a84 Mon Sep 17 00:00:00 2001
From: wtripp180901
Date: Thu, 19 Dec 2024 14:55:35 +0000
Subject: [PATCH 136/182] updated docs

---
 docs/operations.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/operations.md b/docs/operations.md
index edf8881f0..4bebe1b3f 100644
--- a/docs/operations.md
+++ b/docs/operations.md
@@ -75,11 +75,11 @@ By default, the following utility packages are installed during build:
 - git
 - latest python version for system (3.6 for Rocky 8.9 and 3.12 for Rocky 9.4)
 
-Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_other_extra_packages` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example:
+Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_extra_packages_other` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`.
For example: ```yaml # environments/foo-base/inventory/group_vars/all/defaults.yml: - appliances_other_extra_package: + appliances_extra_packages_other: - somepackage - anotherpackage From a3be506598ff8f818c0d3c053bc0f3c2a56f8dc0 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 20 Dec 2024 09:35:22 +0000 Subject: [PATCH 137/182] missed variable rename --- ansible/extras.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index e5ea63408..fd1aa4c1c 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -54,4 +54,4 @@ - name: Install additional packages dnf: name: "{{ appliances_extra_packages }}" - when: appliances_mode != 'configure' or appliances_packages_during_configure + when: appliances_mode != 'configure' or appliances_extra_packages_during_configure From a0ba5f17b37caba2941db4e2a1c4550883797af0 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 20 Dec 2024 13:16:53 +0000 Subject: [PATCH 138/182] bump fatimage --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 4f21c6b99..8a9e3b66a 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241219-1232-7f84fed4", - "RL9": "openhpc-RL9-241219-1145-7f84fed4" + "RL8": "openhpc-RL8-241220-1131-a2dde143", + "RL9": "openhpc-RL9-241220-1131-a2dde143" } } From ada3dc9a428ad723817d9144bab675282356a619 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 2 Jan 2025 09:55:27 +0000 Subject: [PATCH 139/182] Review linting changes Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/filter_plugins/utils.py | 4 +++- ansible/roles/pulp_site/defaults/main.yml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 9559d0fee..1187b3c4b 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -49,7 +49,9 @@ def to_ood_regex(items): return '|'.join(r) def appliances_repo_to_subpath(repo_entry): - return repo_entry['path']+'/'+repo_entry['timestamp'] + """ Take an element from appliances_pulp_repos and convert it to a pulp path. 
This assumes that the remote and local pulp structures are the same + """ + return repo_entry['path'] + '/' + repo_entry['timestamp'] class FilterModule(object): ''' Ansible core jinja2 filters ''' diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index fc23a4489..081307b6a 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -7,7 +7,7 @@ pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" -pulp_site_target_facts: "{{ hostvars[groups['builder'][0]]['ansible_facts'] }}" +pulp_site_target_facts: "{{ hostvars[groups['pulp'][0]]['ansible_facts'] }}" pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" From a7690151f6fefe44ffe24aecd9a856b2862d5a5a Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:59:23 +0000 Subject: [PATCH 140/182] Add note about login node reboot (#510) --- ansible/roles/openondemand/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/openondemand/README.md b/ansible/roles/openondemand/README.md index c6a4f3f9f..365265df0 100644 --- a/ansible/roles/openondemand/README.md +++ b/ansible/roles/openondemand/README.md @@ -17,7 +17,7 @@ This uses the [osc.ood](https://github.com/OSC/ood-ansible) Ansible role to prov ### General - `openondemand_clusters`: Required. Synonym for [osc.ood: clusters](https://github.com/OSC/ood-ansible#clusters) role variable. -- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. +- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. Changing this value on an already deployed cluster requires a reboot of the login node for OOD app state to be correctly refreshed. 
### Authentication See the Open Ondemand [Authentication docs](https://osc.github.io/ood-documentation/latest/authentication/overview.html) for an overview of the authentication process. @@ -77,7 +77,7 @@ The Open Ondemand portal can proxy other servers. Variables: to proxy: - All "compute" nodes, e.g. for Open Ondemand interactive apps such as remote desktop and Jupyter notebook server. - The Grafana server - note a link to Grafana is always added to the Open Ondemand dashboard. - + The exact pattern depends on inventory hostnames / partitions / addresses. - `openondemand_node_proxy_directives`: Optional, default ''. Multiline string to insert into Apache directives definition for `node_uri` ([docs](https://osc.github.io/ood-documentation/master/reference/files/ood-portal-yml.html#configure-reverse-proxy)). From 36ca0d5c0f1e21f74444aa69df6270362cc81885 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 2 Jan 2025 12:12:34 +0000 Subject: [PATCH 141/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 8a9e3b66a..8061cf356 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241220-1131-a2dde143", - "RL9": "openhpc-RL9-241220-1131-a2dde143" + "RL8": "openhpc-RL8-250102-1135-8c98e169", + "RL9": "openhpc-RL9-250102-1135-8c98e169" } } From 5fddb85ac8cc579e97f252af1caac4f160b9c265 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 2 Jan 2025 12:13:44 +0000 Subject: [PATCH 142/182] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 8a9e3b66a..f9e568c3f 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241220-1131-a2dde143", - "RL9": "openhpc-RL9-241220-1131-a2dde143" + "RL8": "openhpc-RL8-250102-1138-77cfc703", + "RL9": "openhpc-RL9-250102-1139-77cfc703" } } From 9a07ff4ddd516f7217404c79e60f7840b200a99f Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 3 Jan 2025 13:23:58 +0000 Subject: [PATCH 143/182] Stop Lustre deleting rdma packages + add to extrabuild test (#502) * move cuda tasks to install * pin nvidia driver to working version and autodetect os/arch * make install of cuda packages optional * don't run cuda install tasks unless during build * move doca install before cuda * update cuda docs * add cuda to extra build test CI * add cuda runtime tasks * fix typo in extras playbook * bump extra build size to 30GB for cuda * pin both cuda package version * make cuda idempotent/restartable * allow using computed tasks_from for cuda role * fix showing image summary * removed faulty cleanup and added lustre to extrabuild test * bumped lustre to supported version --------- Co-authored-by: Steve Brasier --- .github/workflows/extra.yml | 6 +++-- ansible/fatimage.yml | 8 ------- ansible/roles/lustre/README.md | 2 +- ansible/roles/lustre/defaults/main.yml | 2 +- ansible/roles/lustre/tasks/install.yml | 31 
+++++--------------------- 5 files changed, 11 insertions(+), 38 deletions(-) diff --git a/.github/workflows/extra.yml b/.github/workflows/extra.yml index dece242ce..bf438c336 100644 --- a/.github/workflows/extra.yml +++ b/.github/workflows/extra.yml @@ -8,12 +8,14 @@ on: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - 'ansible/roles/cuda/**' + - 'ansible/roles/lustre/**' - '.github/workflows/extra.yml' pull_request: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - 'ansible/roles/cuda/**' + - 'ansible/roles/lustre/**' - '.github/workflows/extra.yml' jobs: @@ -29,11 +31,11 @@ jobs: build: - image_name: openhpc-extra-RL8 source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json - inventory_groups: doca,cuda + inventory_groups: doca,cuda,lustre volume_size: 30 # needed for cuda - image_name: openhpc-extra-RL9 source_image_name_key: RL9 - inventory_groups: doca,cuda + inventory_groups: doca,cuda,lustre volume_size: 30 # needed for cuda env: ANSIBLE_FORCE_COLOR: True diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 9f1e9107c..9a8828a35 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -230,14 +230,6 @@ name: cloudalchemy.grafana tasks_from: install.yml -- hosts: doca - become: yes - gather_facts: yes - tasks: - - name: Install NVIDIA DOCA - import_role: - name: doca - - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/roles/lustre/README.md b/ansible/roles/lustre/README.md index c0a25e037..3ba0dad56 100644 --- a/ansible/roles/lustre/README.md +++ b/ansible/roles/lustre/README.md @@ -8,7 +8,7 @@ Install and configure a Lustre client. This builds RPM packages from source. ## Role Variables -- `lustre_version`: Optional str. Version of lustre to build, default `2.15.5` which is the first version with EL9 support +- `lustre_version`: Optional str. Version of lustre to build, default `2.15.6` which is the first version with EL9.5 support - `lustre_lnet_label`: Optional str. The "lnet label" part of the host's NID, e.g. `tcp0`. Only the `tcp` protocol type is currently supported. Default `tcp`. - `lustre_mgs_nid`: Required str. The NID(s) for the MGS, e.g. `192.168.227.11@tcp1` (separate mutiple MGS NIDs using `:`). - `lustre_mounts`: Required list. 
Define Lustre filesystems and mountpoints as a list of dicts with keys: diff --git a/ansible/roles/lustre/defaults/main.yml b/ansible/roles/lustre/defaults/main.yml index be008ad55..40389970c 100644 --- a/ansible/roles/lustre/defaults/main.yml +++ b/ansible/roles/lustre/defaults/main.yml @@ -1,4 +1,4 @@ -lustre_version: '2.15.5' # https://www.lustre.org/lustre-2-15-5-released/ +lustre_version: '2.15.6' # https://www.lustre.org/lustre-2-15-6-released/ lustre_lnet_label: tcp #lustre_mgs_nid: lustre_mounts: [] diff --git a/ansible/roles/lustre/tasks/install.yml b/ansible/roles/lustre/tasks/install.yml index e0af857cf..852b4652f 100644 --- a/ansible/roles/lustre/tasks/install.yml +++ b/ansible/roles/lustre/tasks/install.yml @@ -41,30 +41,9 @@ ansible.builtin.dnf: name: "{{ _lustre_find_rpms.files | map(attribute='path')}}" disable_gpg_check: yes - -- block: - - name: Remove lustre build prerequisites - # NB Only remove ones this role installed which weren't upgrades - ansible.builtin.dnf: - name: "{{ _new_pkgs }}" - state: absent - vars: - _installed_pkgs: | - {{ - _lustre_dnf_build_packages.results | - select('match', 'Installed:') | - map('regex_replace', '^Installed: (.+?)-[0-9].*$', '\1') - }} - _removed_pkgs: | - {{ - _lustre_dnf_build_packages.results | - select('match', 'Removed:') | - map('regex_replace', '^Removed: (.+?)-[0-9].*$', '\1') - }} - _new_pkgs: "{{ _installed_pkgs | difference(_removed_pkgs) }}" - - - name: Delete lustre build dir - file: - path: "{{ lustre_build_dir }}" - state: absent + +- name: Delete lustre build dir + file: + path: "{{ lustre_build_dir }}" + state: absent when: lustre_build_cleanup | bool From 8c979cdf8c2ee0d80cc21e55a32f2cacef15c746 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:43:59 +0000 Subject: [PATCH 144/182] Fix python/ansible/pulp squeezer versions for RL8 deploy hosts (#516) * fix python/ansible/pulp squeezer versions for RL8 deploy hosts * fixed broken pulp dependency --------- Co-authored-by: wtripp180901 --- ansible/filter_plugins/utils.py | 3 +++ dev/setup-env.sh | 1 + requirements.txt | 4 ++-- requirements.yml | 4 ++++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 1187b3c4b..508f794cc 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -41,6 +41,9 @@ def to_ood_regex(items): eg {{ [compute-001, compute-002, control] | to_regex }} -> '(compute-\d+)|(control)' """ + # NB: for python3.12+ the \d in this function & docstring + # need to be raw strings. 
See https://docs.python.org/3/reference/lexical_analysis.html + # There's a python bug which means re.sub() can't use '\d' in the replacement so # have to do replacement in two stages: r = [re.sub(r"\d+", 'XBACKSLASHX', v) for v in items] diff --git a/dev/setup-env.sh b/dev/setup-env.sh index bfa0758e6..6d701f2b7 100755 --- a/dev/setup-env.sh +++ b/dev/setup-env.sh @@ -17,6 +17,7 @@ PYTHON_VERSION="" if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then PYTHON_VERSION="/usr/bin/python3.10" elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then + # python3.9+ doesn't have selinux bindings PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then PYTHON_VERSION="/usr/bin/python3.9" diff --git a/requirements.txt b/requirements.txt index 7d81f3285..872ee9516 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ansible==8.0.0 +ansible==6.7.0 # cloudalchemy.prometheus uses ansible.builtin.include, removed in ansible-core==2.16 => ansible==9 openstacksdk python-openstackclient==6.6.1 # v7.0.0 has a bug re. rebuild python-manilaclient @@ -9,4 +9,4 @@ cookiecutter selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 netaddr matplotlib -pulp-cli==0.29.2 +pulp-cli==0.23.2 diff --git a/requirements.yml b/requirements.yml index 2ede96950..7e71bb904 100644 --- a/requirements.yml +++ b/requirements.yml @@ -49,6 +49,10 @@ collections: - name: https://github.com/azimuth-cloud/ansible-collection-image-utils type: git version: 0.4.0 + # stackhpc.pulp has pulp.squeezer as dependency, any version, but latest + # requires newer ansible than can install + - name: pulp.squeezer + version: 0.0.15 - name: stackhpc.pulp version: 0.5.5 ... 
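As an aside on the raw-string comment above, the following standalone sketch (not the role code itself) shows the two-stage substitution with raw-string patterns, avoiding both the `re.sub()` replacement issue and Python 3.12's invalid-escape warnings:

```python
import re

items = ["compute-001", "compute-002", "control"]

# Stage 1: collapse digit runs to a placeholder; the pattern is a raw string.
collapsed = {re.sub(r"\d+", "XBACKSLASHX", v) for v in items}

# Stage 2: swap the placeholder for '\d+' using str.replace(), since backslash
# escapes in a re.sub() replacement string would be interpreted.
patterns = [v.replace("XBACKSLASHX", r"\d+") for v in sorted(collapsed)]

print("|".join("(%s)" % p for p in patterns))  # -> (compute-\d+)|(control)
```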
From 510cfd01ea27e9356a0cd07b000ca0363f3dbfad Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 6 Jan 2025 10:00:25 +0000 Subject: [PATCH 145/182] extend cookiecutter terraform config for compute init script --- .../terraform/compute.tf | 2 ++ .../terraform/compute/nodes.tf | 13 ++++++++++--- .../terraform/compute/variables.tf | 6 ++++++ .../terraform/variables.tf | 7 +++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 14c728a5a..d52c3c42c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -18,4 +18,6 @@ module "compute" { k3s_token = var.k3s_token control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] + + compute_init_enable = var.compute_init_enable } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 7a2a706a6..ac34a443c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -45,9 +45,16 @@ resource "openstack_compute_instance_v2" "compute" { } metadata = { - environment_root = var.environment_root - k3s_token = var.k3s_token - control_address = var.control_address + environment_root = var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + enable_compute = contains(var.compute_init_enable, "compute") + enable_resolv_conf = contains(var.compute_init_enable, "resolv_conf") + enable_etc_hosts = contains(var.compute_init_enable, "etc_hosts") + enable_nfs = contains(var.compute_init_enable, "nfs") + enable_manila = contains(var.compute_init_enable, "manila") + enable_basic_users = contains(var.compute_init_enable, "basic_users") + enable_eessi = contains(var.compute_init_enable, "eessi") } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 3655c9e65..a0e90c61b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -76,3 +76,9 @@ variable "control_address" { description = "Name/address of control node" type = string } + +variable "compute_init_enable" { + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] +} \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 0f5eefa18..19027dd19 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,6 +52,7 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile + compute_init_enable: Toggles ansible-init rebuild EOF } @@ -136,3 +137,9 @@ variable 
"k3s_token" { description = "K3s cluster authentication token, set automatically by Ansible" type = string } + +variable "compute_init_enable" { + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] +} \ No newline at end of file From 4def5bab5f4ddc01bda873df3e6eff5330afef96 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 7 Jan 2025 09:21:00 +0000 Subject: [PATCH 146/182] Add Release Train OpenHPC repos (#515) * Added OpenHPC release train repos * bump images * Comment update Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * refactored and defaulted toggling ohpc repos * bump images * Updated comment --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/dnf_repos/defaults/main.yml | 12 +++++++++++- ansible/roles/pulp_site/defaults/main.yml | 4 ++++ .../terraform/cluster_image.auto.tfvars.json | 4 ++-- .../common/inventory/group_vars/all/defaults.yml | 15 +++++++++++++++ .../common/inventory/group_vars/all/openhpc.yml | 11 ++++++++++- 5 files changed, 42 insertions(+), 4 deletions(-) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 17114b49d..841631890 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -17,7 +17,7 @@ dnf_repos_filenames: dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}" # epel installed separately -dnf_repos_repolist: +dnf_repos_default_repolist: - file: "{{ dnf_repos_version_filenames.baseos }}" name: baseos base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" @@ -31,5 +31,15 @@ dnf_repos_repolist: name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" +dnf_repos_openhpc_repolist: +- name: OpenHPC + file: OpenHPC + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_base[ansible_distribution_major_version] | appliances_repo_to_subpath }}" +- name: OpenHPC-updates + file: OpenHPC + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_updates[ansible_distribution_major_version] | appliances_repo_to_subpath }}" + +dnf_repos_repolist: "{{ dnf_repos_default_repolist + (dnf_repos_openhpc_repolist if (openhpc_install_type | default('ohpc')) == 'ohpc' else []) }}" + dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_epel_description: "epel" diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index 081307b6a..c0b191336 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -22,6 +22,10 @@ pulp_site_rpm_info: subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" - name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}" subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ohpc-{{ pulp_site_target_distribution_version_major }}-{{ 
appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index f9e568c3f..943a2dfbd 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250102-1138-77cfc703", - "RL9": "openhpc-RL9-250102-1139-77cfc703" + "RL8": "openhpc-RL8-250106-0916-f8603056", + "RL9": "openhpc-RL9-250106-0916-f8603056" } } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index f32d14c60..e052eb709 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -151,3 +151,18 @@ appliances_pulp_repos: '8': timestamp: 20241216T235733 path: epel/8/Everything/x86_64 + openhpc_base: + '8': + path: OpenHPC/2/EL_8 + timestamp: 20241218T154614 + '9': + path: OpenHPC/3/EL_9 + timestamp: 20241218T154614 + openhpc_updates: + '8': + path: OpenHPC/2/updates/EL_8 + timestamp: 20241218T154614 + '9': + path: OpenHPC/3/updates/EL_9 + timestamp: 20241218T154614 + diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index e3d20b9c3..3b3879de9 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -39,6 +39,15 @@ openhpc_config_extra: {} openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}" openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}" +openhpc_install_type: ohpc # 'ohpc' or 'generic', see https://github.com/stackhpc/ansible-slurm-appliance/pull/326 + +# Empty repo lists from stackhpc.openhpc role defaults, as these repofiles are +# now generated by dnf_repos to allow injecting Ark creds: +ohpc_openhpc_repos: + "9": [] + "8": [] + +# overriding to ensure doesn't overwrite Ark epel repo ohpc_default_extra_repos: - "9": [] #overriding to ensure doesn't overwrite ark epel repo + "9": [] "8": [] From 8290a313885dea62421bee125c9460acecf9570a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 11:04:21 +0000 Subject: [PATCH 147/182] define default compute init flags --- environments/.stackhpc/terraform/compute_init.auto.tfvars | 7 +++++++ environments/.stackhpc/terraform/main.tf | 5 +++++ 2 files changed, 12 insertions(+) create mode 100644 environments/.stackhpc/terraform/compute_init.auto.tfvars diff --git a/environments/.stackhpc/terraform/compute_init.auto.tfvars b/environments/.stackhpc/terraform/compute_init.auto.tfvars new file mode 100644 index 000000000..032ae5adb --- /dev/null +++ b/environments/.stackhpc/terraform/compute_init.auto.tfvars @@ -0,0 +1,7 @@ 
+compute_init_enable = [ + "compute", + "etc_hosts", + "nfs", + "basic_users", + "eessi" +] diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 4284ec132..d54903cc4 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -58,6 +58,10 @@ variable "k3s_token" { type = string } +variable "compute_init_enable" { + type = list(string) +} + data "openstack_images_image_v2" "cluster" { name = var.cluster_image[var.os_version] most_recent = true @@ -74,6 +78,7 @@ module "cluster" { cluster_image_id = data.openstack_images_image_v2.cluster.id control_node_flavor = var.control_node_flavor k3s_token = var.k3s_token + compute_init_enable = var.compute_init_enable login_nodes = { login-0: var.other_node_flavor From 354ce1e810f4be3836919ac250e3fbb9e1634f9e Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 12:47:42 +0000 Subject: [PATCH 148/182] add CI tests for compute node rebuilds --- .github/workflows/stackhpc.yml | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..a5267e508 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -170,33 +170,22 @@ jobs: env: TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - # - name: Build environment-specific compute image - # id: packer_build - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # cd packer/ - # packer init - # PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - # ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs - - # - name: Test reimage of compute nodes to new environment-specific image (via slurm) - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]" - # ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down - # ansible-playbook -v ansible/ci/check_slurm.yml - - name: Test reimage of login and control nodes (via rebuild adhoc) run: | . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml + - name: Test reimage of compute nodes and compute-init (via rebuild adhoc) + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml + ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down + ansible-playbook -v ansible/ci/check_slurm.yml + - name: Check sacct state survived reimage run: | . 
venv/bin/activate From b903cdd0a3350d15eee65a8f4835477e21ed15ca Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 7 Jan 2025 14:59:10 +0000 Subject: [PATCH 149/182] document metadata toggle flags and CI workflow --- ansible/roles/compute_init/README.md | 31 +++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 77a127245..40d9b7326 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -42,10 +42,35 @@ The following roles/groups are currently fully functional: node and all compute nodes. - `openhpc`: all functionality -# Development/debugging +All of the above are defined in the skeleton cookiecutter config, and are +toggleable via a terraform compute_init autovar file. In the .stackhpc +environment, the compute init roles are set by default to: +- `enable_compute`: This encompasses the openhpc role functionality while being + a global toggle for the entire compute-init script. +- `etc_hosts` +- `nfs` +- `basic_users` +- `eessi` + +# CI workflow + +The compute node rebuild is tested in CI after the tests for rebuilding the +login and control nodes. The process follows + +1. Compute nodes are reimaged: + + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml -To develop/debug this without actually having to build an image: +2. Ansible-init runs against newly reimaged compute nodes + +3. Run sinfo and check nodes have expected slurm state + + ansible-playbook -v ansible/ci/check_slurm.yml + +# Development/debugging +To develop/debug changes to the compute script without actually having to build +a new image: 1. Deploy a cluster using tofu and ansible/site.yml as normal. This will additionally configure the control node to export compute hostvars over NFS. @@ -103,7 +128,7 @@ as in step 3. 
available v the current approach: ``` - [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", From 50fc320be89db6e5884323830eb5c548ddbb8199 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:13:38 +0000 Subject: [PATCH 150/182] Update ceph to use ark packages and move RL9 to ceph reef (#519) * Release train support for ceph repos * bump images * Update requirements.yml * bumped rocky 9 ceph repos to reef * updated rl9 ceph version number * bump images * reverted to upstream ceph versions * Update requirements.yml * comment --- ansible/roles/dnf_repos/defaults/main.yml | 3 +++ ansible/roles/pulp_site/defaults/main.yml | 2 ++ .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- environments/common/inventory/group_vars/all/defaults.yml | 8 +++++++- .../common/inventory/group_vars/all/os-manila-mount.yml | 3 +++ requirements.yml | 2 +- 6 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/os-manila-mount.yml diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 841631890..6d41046ec 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -30,6 +30,9 @@ dnf_repos_default_repolist: - file: "{{ dnf_repos_version_filenames.extras }}" name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" +- file: ceph + name: Ceph + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_openhpc_repolist: - name: OpenHPC diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index c0b191336..c549dac53 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -26,6 +26,8 @@ pulp_site_rpm_info: subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" - name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ceph-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 943a2dfbd..9c72b07ce 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250106-0916-f8603056", - 
"RL9": "openhpc-RL9-250106-0916-f8603056" + "RL8": "openhpc-RL8-250107-1534-b03caaf3", + "RL9": "openhpc-RL9-250107-1535-b03caaf3" } } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e052eb709..e26bc3018 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -165,4 +165,10 @@ appliances_pulp_repos: '9': path: OpenHPC/3/updates/EL_9 timestamp: 20241218T154614 - + ceph: + '8': + timestamp: 20231104T015751 + path: centos/8-stream/storage/x86_64/ceph-quincy + '9': + timestamp: 20240923T233036 + path: centos/9-stream/storage/x86_64/ceph-reef diff --git a/environments/common/inventory/group_vars/all/os-manila-mount.yml b/environments/common/inventory/group_vars/all/os-manila-mount.yml new file mode 100644 index 000000000..6b25d62cb --- /dev/null +++ b/environments/common/inventory/group_vars/all/os-manila-mount.yml @@ -0,0 +1,3 @@ +# Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are +# now generated by dnf_repos to allow injecting Ark creds: +os_manila_mount_ceph_rpm_repos: [] diff --git a/requirements.yml b/requirements.yml index 7e71bb904..71adbc6e5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -21,7 +21,7 @@ roles: version: v3.1.5 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: v24.11.0 # Support ceph quincy for RL9 + version: v25.1.1 collections: - name: containers.podman From 781c2d474848309dbe42bb4ca83343b1aad3b621 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:45:03 +0000 Subject: [PATCH 151/182] Add more information re. configuring production sites (#508) * add lots of info to production docs * Production docs tweaks from review Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> * add prod docs comment re login FIPs --------- Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> --- docs/production.md | 150 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 5 deletions(-) diff --git a/docs/production.md b/docs/production.md index 7219ee7fc..c1b139994 100644 --- a/docs/production.md +++ b/docs/production.md @@ -1,9 +1,149 @@ # Production Deployments -This page contains some brief notes about differences between the default/demo configuration, as described in the main [README.md](../README.md) and production-ready deployments. +This page contains some brief notes about differences between the default/demo +configuration (as described in the main [README.md](../README.md)) and +production-ready deployments. + +- Get it agreed up front what the cluster names will be. Changing this later + requires instance deletion/recreation. 
+
+- At least three environments should be created:
+  - `site`: site-specific base environment
+  - `production`: production environment
+  - `staging`: staging environment
+
+  A `dev` environment should also be created if considered required, or this
+  can be left until later.
+
+  These can all be produced using the cookiecutter instructions, but the
+  `production` and `staging` environments will need their
+  `environments/$ENV/ansible.cfg` file modifying so that they point to the
+  `site` environment:
+
+  ```ini
+  inventory = ../common/inventory,../site/inventory,inventory
+  ```
+
+- To avoid divergence of configuration all possible overrides for group/role
+vars should be placed in `environments/site/inventory/group_vars/all/*.yml`
+unless the value really is environment-specific (e.g. DNS names for
+`openondemand_servername`).
+
+- Where possible hooks should also be placed in `environments/site/hooks/`
+and referenced from the `site` and `production` environments, e.g.:
+
+  ```yaml
+  # environments/production/hooks/pre.yml:
+  - name: Import parent hook
+    import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml"
+  ```
+
+- OpenTofu configurations should be defined in the `site` environment and used
+  as a module from the other environments. This can be done with the
+  cookie-cutter generated configurations:
+  - Delete the *contents* of the cookie-cutter generated `terraform/` directories
+    from the `production` and `staging` environments.
+  - Create a `main.tf` in those directories which uses `site/terraform/` as a
+    [module](https://opentofu.org/docs/language/modules/), e.g.:
+
+    ```
+    ...
+    module "cluster" {
+      source = "../../site/terraform/"
+
+      cluster_name = "foo"
+      ...
+    }
+    ```
+
+  Note that:
+  - Environment-specific variables (`cluster_name`) should be hardcoded
+    into the module block.
+  - Environment-independent variables (e.g. maybe `cluster_net` if the
+    same is used for staging and production) should be set as *defaults*
+    in `environments/site/terraform/variables.tf`, and then don't need to
+    be passed in to the module.
+
+- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates
+  a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`.
+  To ensure staging environments are a good model for production this should
+  generally be moved into the `site` environment. It should be encrypted
+  using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html)
+  and then committed to the repository.
+
+- Ensure created instances have accurate/synchronised time. For VM instances
+  this is usually provided by the hypervisor, but if not (or for bare metal
+  instances) it may be necessary to configure or proxy `chronyd` via an
+  environment hook.
+
+- The cookiecutter provided OpenTofu configurations define resources for home and
+  state volumes. The former may not be required if the cluster's `/home` is
+  provided from an external filesystem (or Manila). In any case, in at least
+  the production environment, and probably also in the staging environment,
+  the volumes should be manually created and the resources changed to [data
+  resources](https://opentofu.org/docs/language/data-sources/). This ensures that even if the cluster is deleted via tofu, the
+  volumes will persist.
+
+  For a development environment, having volumes under tofu control via volume
+  resources is usually appropriate as there may be many instantiations
+  of this environment.
+ +- Enable `etc_hosts` templating: + + ```yaml + # environments/site/inventory/groups: + [etc_hosts:children] + cluster + ``` -- Create a site environment. Usually at least production, staging and possibly development environments are required. To avoid divergence of configuration these should all have an `inventory` path referencing a shared, site-specific base environment. Where possible hooks should also be placed in this site-specific environment. -- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. To ensure staging environments are a good model for production this should generally be moved into the site-specific environment. It can be be encrypted using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) and then committed to the repository. -- Ensure created instances have accurate/synchronised time. For VM instances this is usually provided by the hypervisor, but if not (or for bare metal instances) it may be necessary to configure or proxy `chronyd` via an environment hook. -- Remove production volumes from OpenTofu control. In the default OpenTofu configuration, deleting the resources also deletes the volumes used for persistent state and home directories. This is usually undesirable for production, so these resources should be removed from the OpenTofu configurations and manually deployed once. However note that for development environments leaving them under OpenTofu control is usually best. - Configure Open OpenOndemand - see [specific documentation](openondemand.README.md). + +- Modify `environments/site/terraform/nodes.tf` to provide fixed IPs for at least + the control node, and (if not using FIPs) the login node(s): + + ``` + resource "openstack_networking_port_v2" "control" { + ... + fixed_ip { + subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id + ip_address = var.control_ip_address + } + } + ``` + + Note the variable `control_ip_address` is new. + + Using fixed IPs will require either using admin credentials or policy changes. + +- If floating IPs are required for login nodes, modify the OpenTofu configurations + appropriately. + +- Enable persisting login node hostkeys so users do not get annoying ssh warning + messages on reimage: + + ```yaml + # environments/site/inventory/groups: + [persist_hostkeys:children] + login + ``` + And configure NFS to include exporting the state directory to these hosts: + + ```yaml + # environments/common/inventory/group_vars/all/nfs.yml: + nfs_configurations: + # ... potentially, /home defintion from common environment + - comment: Export state directory to login nodes + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['login'] }}" + nfs_server: "{{ nfs_server_default }}" + nfs_export: "/var/lib/state" + nfs_client_mnt_point: "/var/lib/state" + ``` + See [issue 506](https://github.com/stackhpc/ansible-slurm-appliance/issues/506). + +- Consider whether mapping of baremetal nodes to ironic nodes is required. See + [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). + +- Note [PR 473](https://github.com/stackhpc/ansible-slurm-appliance/pull/473) + may help identify any site-specific configuration. 
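To illustrate the volume change described above, a sketch only, using a placeholder volume name and assuming the OpenStack provider's standard block storage data source:

```
# Before: volume lifecycle managed (and destroyable) by tofu:
# resource "openstack_blockstorage_volume_v3" "state" {
#   name = "mycluster-state"
#   size = 150
# }

# After: volume created manually once, then only looked up:
data "openstack_blockstorage_volume_v3" "state" {
  name = "mycluster-state" # placeholder; use the manually-created volume's name
}

# References elsewhere change from
#   openstack_blockstorage_volume_v3.state.id
# to
#   data.openstack_blockstorage_volume_v3.state.id
```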
From a1e5bd7173f60735cf70270b16b2f169e81692f4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 8 Jan 2025 14:40:52 +0000 Subject: [PATCH 152/182] Reworked persist_hostkeys role to use common set of persistent keys from state directory --- ansible/roles/persist_hostkeys/README.md | 7 +-- .../roles/persist_hostkeys/defaults/main.yml | 2 + ansible/roles/persist_hostkeys/tasks/main.yml | 63 +++++++++++-------- environments/common/layouts/everything | 6 +- 4 files changed, 46 insertions(+), 32 deletions(-) create mode 100644 ansible/roles/persist_hostkeys/defaults/main.yml diff --git a/ansible/roles/persist_hostkeys/README.md b/ansible/roles/persist_hostkeys/README.md index 2d823dc36..6201a104b 100644 --- a/ansible/roles/persist_hostkeys/README.md +++ b/ansible/roles/persist_hostkeys/README.md @@ -1,8 +1,5 @@ # persist_hostkeys -Save hostkeys to persistent storage and restore them after a rebuild/reimage. +Idempotently generates a persistent set of hostkeys and restores them after a rebuild/reimage. -Add hosts to the `persist_hostkeys` group to enable. - -This role has no variables but hosts in this group must have `appliances_state_dir` -defined as a directory they can write to on persistent storage. +Add hosts to the `persist_hostkeys` group to enable. All hosts in group will share the same set hostkeys. diff --git a/ansible/roles/persist_hostkeys/defaults/main.yml b/ansible/roles/persist_hostkeys/defaults/main.yml new file mode 100644 index 000000000..3c0000466 --- /dev/null +++ b/ansible/roles/persist_hostkeys/defaults/main.yml @@ -0,0 +1,2 @@ +persist_hostkeys_state_server: "{{ groups['control'] | first }}" +persist_hostkeys_state_dir: "{{ hostvars[persist_hostkeys_state_server]['appliances_state_dir'] }}/hostkeys" diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index 47493220d..8bb2d6306 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -1,33 +1,46 @@ --- -- name: Ensure hostkeys directory exists on persistent storage - file: - path: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}" - state: directory - owner: root - group: root - mode: 0600 +- name: Generate persistent hostkeys in state directory + delegate_to: "{{ persist_hostkeys_state_server }}" + block: + - name: Ensure hostkeys directory exists on persistent storage + file: + path: "{{ persist_hostkeys_state_dir }}" + state: directory + owner: root + group: root + mode: 0600 -- name: Copy hostkeys from persistent storage - # won't fail if no keys are in persistent storage - copy: - src: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" - dest: /etc/ssh/ - remote_src: true + - name: Check for existing hostkeys + find: + paths: "{{ persist_hostkeys_state_dir }}/" + register: _files_found + + - name: Generate hostkeys + when: _files_found.matched == 0 + shell: + cmd: | + mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh + ssh-keygen -A -N \"\" -f {{ persist_hostkeys_state_dir }} + mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} + rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh + + - name: Get created key names + find: + path: "{{ persist_hostkeys_state_dir }}/" + register: _find_ssh_keys -- name: Find hostkeys - find: - path: /etc/ssh/ - patterns: ssh_host_*_key* - register: _find_ssh_keys + - name: Create in-memory copies of keys + ansible.builtin.slurp: + src: "{{ item.path }}" + loop: "{{ _find_ssh_keys.files }}" + register: _slurp_keys -- name: 
Persist hostkeys +- name: Copy keys to hosts + no_log: true copy: - dest: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" - src: "{{ item }}" - remote_src: true - mode: preserve - loop: "{{ _find_ssh_keys.files | map(attribute='path') }}" + content: "{{ item.content | b64decode }}" + dest: "/etc/ssh/{{ item.source | regex_search('[^/]+$') }}" + loop: "{{ _slurp_keys.results }}" - meta: reset_connection - diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 878bebbf3..0d3c57ad5 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -69,8 +69,10 @@ openhpc [manila] # Hosts to configure for manila fileshares -[persist_hostkeys] -# Hosts to persist hostkeys for across reimaging. NB: Requires appliances_state_dir on hosts. +[persist_hostkeys:children] +# Hosts to use common set of hostkeys which persist across reimaging. +login +openondemand [squid] # Hosts to run squid proxy From fa028f9acc986372e0dcd2b9f0d949fc0066c19b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 8 Jan 2025 14:44:23 +0000 Subject: [PATCH 153/182] removed unnescessary caas config --- environments/.caas/inventory/extra_groups | 3 --- environments/.caas/inventory/group_vars/all/nfs.yml | 11 +---------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups index d60ae7839..45a1dc7aa 100644 --- a/environments/.caas/inventory/extra_groups +++ b/environments/.caas/inventory/extra_groups @@ -14,6 +14,3 @@ compute [podman:children] zenith - -[persist_hostkeys:children] -openondemand diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index 14fff6295..f42422601 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -1,14 +1,5 @@ nfs_server: "{{ nfs_server_default }}" -caas_nfs_ood_state: - - comment: Export /var/lib/state from Slurm control node to OOD - nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - clients: "{{ inventory_hostname in groups['openondemand'] }}" - nfs_export: "{{ appliances_state_dir }}" - nfs_client_mnt_point: "{{ appliances_state_dir }}" - nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service" - caas_nfs_home: - comment: Export /exports/home from Slurm control node as /home nfs_enable: @@ -17,4 +8,4 @@ caas_nfs_home: nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" -nfs_configurations: "{{ caas_nfs_ood_state + (caas_nfs_home if not cluster_home_manila_share | bool else []) }}" +nfs_configurations: "{{ caas_nfs_home if not cluster_home_manila_share | bool else [] }}" From 001c459cf65f156763abe53e36588ee26121ef42 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 8 Jan 2025 14:51:57 +0000 Subject: [PATCH 154/182] updated docs --- docs/production.md | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/docs/production.md b/docs/production.md index c1b139994..e767c61b4 100644 --- a/docs/production.md +++ b/docs/production.md @@ -118,30 +118,6 @@ and referenced from the `site` and `production` environments, e.g.: - If floating IPs are required for login nodes, modify the OpenTofu configurations appropriately. 
-- Enable persisting login node hostkeys so users do not get annoying ssh warning - messages on reimage: - - ```yaml - # environments/site/inventory/groups: - [persist_hostkeys:children] - login - ``` - And configure NFS to include exporting the state directory to these hosts: - - ```yaml - # environments/common/inventory/group_vars/all/nfs.yml: - nfs_configurations: - # ... potentially, /home defintion from common environment - - comment: Export state directory to login nodes - nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - clients: "{{ inventory_hostname in groups['login'] }}" - nfs_server: "{{ nfs_server_default }}" - nfs_export: "/var/lib/state" - nfs_client_mnt_point: "/var/lib/state" - ``` - See [issue 506](https://github.com/stackhpc/ansible-slurm-appliance/issues/506). - - Consider whether mapping of baremetal nodes to ironic nodes is required. See [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). From 2bea51cdb0ec0cb32471372e908238b82f581c16 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 8 Jan 2025 16:03:13 +0000 Subject: [PATCH 155/182] review suggestions --- .github/workflows/stackhpc.yml | 1 - ansible/roles/compute_init/README.md | 26 +--- docs/experimental/compute-init.md | 111 ++---------------- .../terraform/compute_init.auto.tfvars | 7 -- environments/.stackhpc/terraform/main.tf | 6 +- .../terraform/compute.tf | 2 +- .../terraform/compute/nodes.tf | 20 ++-- .../terraform/variables.tf | 8 +- 8 files changed, 21 insertions(+), 160 deletions(-) delete mode 100644 environments/.stackhpc/terraform/compute_init.auto.tfvars diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index a5267e508..ea18a2274 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -183,7 +183,6 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 40d9b7326..db18034aa 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -42,30 +42,8 @@ The following roles/groups are currently fully functional: node and all compute nodes. - `openhpc`: all functionality -All of the above are defined in the skeleton cookiecutter config, and are -toggleable via a terraform compute_init autovar file. In the .stackhpc -environment, the compute init roles are set by default to: -- `enable_compute`: This encompasses the openhpc role functionality while being - a global toggle for the entire compute-init script. -- `etc_hosts` -- `nfs` -- `basic_users` -- `eessi` - -# CI workflow - -The compute node rebuild is tested in CI after the tests for rebuilding the -login and control nodes. The process follows - -1. Compute nodes are reimaged: - - ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - -2. Ansible-init runs against newly reimaged compute nodes - -3. Run sinfo and check nodes have expected slurm state - - ansible-playbook -v ansible/ci/check_slurm.yml +The above may be enabled by setting the compute_init_enable property on the +terraform compute variable. 
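+
+For example, a sketch of a group definition in the OpenTofu `compute`
+variable (the group name, node names and flavor here are illustrative
+only):
+
+    compute = {
+      standard = {
+        nodes  = ["compute-0", "compute-1"]
+        flavor = "some.compute.flavor" # site-specific
+        compute_init_enable = ["compute", "etc_hosts", "nfs", "basic_users", "eessi"]
+      }
+    }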
# Development/debugging diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index dae840d95..c7c1d4d8c 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -2,112 +2,17 @@ See the role README.md -# Results/progress +# CI workflow -Without any metadata: +The compute node rebuild is tested in CI after the tests for rebuilding the +login and control nodes. The process follows - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago - Main PID: 16089 (ansible-init) - Tasks: 8 (limit: 10912) - Memory: 99.5M - CPU: 11.687s - CGroup: /system.slice/ansible-init.service - ├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init - ├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml - ├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml - ├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0" - ├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py - ├─16363 /usr/bin/mount /mnt/cluster - └─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync +1. Compute nodes are reimaged: - Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] - Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] ********************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure the mount directory exists] *************************************** - Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access> - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1] - Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ****************************************************** - Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None - [root@rl9-compute-0 rocky]# systemctl status ansible-init + ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml -Added metadata via horizon: +2. Ansible-init runs against newly reimaged compute nodes - compute_groups ["compute"] +3. 
Run sinfo and check nodes have expected slurm state - -OK: - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago - Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 16089 (code=exited, status=0/SUCCESS) - CPU: 13.003s - - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => { - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share" - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: } - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP ********************************************************************* - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1 - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully - Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. - -Now run site.yml, then restart ansible-init again: - - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago - Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 18921 (code=exited, status=0/SUCCESS) - CPU: 8.240s - - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] ********************** - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ******************************************************************** - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP ********************************************************************* - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully - Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
- [root@rl9-compute-0 rocky]# ls /mnt/cluster/host - hosts hostvars/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- - rl9-compute-0/ rl9-compute-1/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- - rl9-compute-0/ rl9-compute-1/ - [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/ - hostvars.yml - -This commit - shows that hostvars have loaded: - - [root@rl9-compute-0 rocky]# systemctl status ansible-init - ● ansible-init.service - Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) - Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago - Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) - Main PID: 27585 (code=exited, status=0/SUCCESS) - CPU: 8.161s - - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] **************************************** - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => { - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0" - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: } - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP ********************************************************************* - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done - Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully - Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. + ansible-playbook -v ansible/ci/check_slurm.yml \ No newline at end of file diff --git a/environments/.stackhpc/terraform/compute_init.auto.tfvars b/environments/.stackhpc/terraform/compute_init.auto.tfvars deleted file mode 100644 index 032ae5adb..000000000 --- a/environments/.stackhpc/terraform/compute_init.auto.tfvars +++ /dev/null @@ -1,7 +0,0 @@ -compute_init_enable = [ - "compute", - "etc_hosts", - "nfs", - "basic_users", - "eessi" -] diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index d54903cc4..872003db3 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -58,10 +58,6 @@ variable "k3s_token" { type = string } -variable "compute_init_enable" { - type = list(string) -} - data "openstack_images_image_v2" "cluster" { name = var.cluster_image[var.os_version] most_recent = true @@ -78,7 +74,6 @@ module "cluster" { cluster_image_id = data.openstack_images_image_v2.cluster.id control_node_flavor = var.control_node_flavor k3s_token = var.k3s_token - compute_init_enable = var.compute_init_enable login_nodes = { login-0: var.other_node_flavor @@ -87,6 +82,7 @@ module "cluster" { standard: { # NB: can't call this default! 
nodes: ["compute-0", "compute-1"] flavor: var.other_node_flavor + compute_init_enable: ["compute", "etc_hosts", "nfs", "basic_users", "eessi"] } # Example of how to add another partition: # extra: { diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index d52c3c42c..dcc692c1a 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -19,5 +19,5 @@ module "compute" { control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] - compute_init_enable = var.compute_init_enable + compute_init_enable = each.value.compute_init_enable } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index ac34a443c..d3a37bc5b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -44,18 +44,14 @@ resource "openstack_compute_instance_v2" "compute" { access_network = true } - metadata = { - environment_root = var.environment_root - k3s_token = var.k3s_token - control_address = var.control_address - enable_compute = contains(var.compute_init_enable, "compute") - enable_resolv_conf = contains(var.compute_init_enable, "resolv_conf") - enable_etc_hosts = contains(var.compute_init_enable, "etc_hosts") - enable_nfs = contains(var.compute_init_enable, "nfs") - enable_manila = contains(var.compute_init_enable, "manila") - enable_basic_users = contains(var.compute_init_enable, "basic_users") - enable_eessi = contains(var.compute_init_enable, "eessi") - } + metadata = merge( + { + environment_root = var.environment_root + k3s_token = var.k3s_token + control_address = var.control_address + }, + {for e in var.compute_init_enable: e => true} + ) user_data = <<-EOF #cloud-config diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 19027dd19..b2e16c942 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,7 +52,7 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile - compute_init_enable: Toggles ansible-init rebuild + compute_init_enable: Toggles compute-init rebuild (see compute-init role docs) EOF } @@ -136,10 +136,4 @@ variable "root_volume_size" { variable "k3s_token" { description = "K3s cluster authentication token, set automatically by Ansible" type = string -} - -variable "compute_init_enable" { - type = list(string) - description = "Groups to activate for ansible-init compute rebuilds" - default = [] } \ No newline at end of file From dc58a257499f4e6653664f78e2853034c15f3101 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:58:23 +0000 Subject: [PATCH 156/182] Change defaults so a cookiecutter environment is fully functional (#473) * cookiecutter environment now has working defaults * updated docs * refactored ood demo user 
into cookiecutter * updated docs * changed secret name * Doc changes Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * rename * replaced testuser with demo_user * selinux now defaults to disabled * bump images * updated readme * moved files and removed redundant ood config * environments now have grafana anonymous auth by default * docs update Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- .github/workflows/stackhpc.yml | 10 +++++----- README.md | 1 + ansible/roles/passwords/defaults/main.yml | 1 + ansible/roles/passwords/tasks/validate.yml | 2 +- docs/{openondemand.README.md => openondemand.md} | 12 +++++++----- docs/production.md | 4 ++++ .../.caas/inventory/group_vars/all/selinux.yml | 1 - .../inventory/group_vars/all/basic_users.yml | 6 +++--- .../.stackhpc/inventory/group_vars/all/freeipa.yml | 4 ++-- .../{grafana/overrides.yml => all/grafana.yml} | 0 .../{openhpc/overrides.yml => all/openhpc.yml} | 0 .../inventory/group_vars/all/openondemand.yml | 9 ++++++++- .../inventory/group_vars/openondemand/overrides.yml | 8 -------- .../inventory/group_vars/selinux/overrides.yml | 1 - .../common/inventory/group_vars/all/openondemand.yml | 7 ++++++- .../common/inventory/group_vars/all/selinux.yml | 2 +- environments/common/layouts/everything | 6 ++++-- .../inventory/group_vars/all/basic_users.yml | 4 ++++ .../inventory/group_vars/all/grafana.yml | 1 + .../terraform/variables.tf | 2 +- 20 files changed, 49 insertions(+), 32 deletions(-) rename docs/{openondemand.README.md => openondemand.md} (76%) delete mode 100644 environments/.caas/inventory/group_vars/all/selinux.yml rename environments/.stackhpc/inventory/group_vars/{grafana/overrides.yml => all/grafana.yml} (100%) rename environments/.stackhpc/inventory/group_vars/{openhpc/overrides.yml => all/openhpc.yml} (100%) delete mode 100644 environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml delete mode 100644 environments/.stackhpc/inventory/group_vars/selinux/overrides.yml create mode 100644 environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml create mode 100644 environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b08854adb..eaca3a3ae 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -99,9 +99,9 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate ansible-playbook ansible/adhoc/generate-passwords.yml - echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml + echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml env: - TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} - name: Provision nodes using fat image id: provision_servers @@ -163,12 +163,12 @@ jobs: --spider \ --server-response \ --no-check-certificate \ - --http-user=testuser \ - --http-password=${TESTUSER_PASSWORD} https://${openondemand_servername} \ + --http-user=demo_user \ + --http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \ 2>&1) (echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) env: - TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} + DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} # - name: Build environment-specific compute image # id: packer_build diff --git a/README.md b/README.md index f66441915..593837ccd 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ To deploy this infrastructure, ensure the venv and the environment are [activate export OS_CLOUD=openstack cd environments/$ENV/terraform/ + tofu init tofu apply and follow the prompts. Note the OS_CLOUD environment variable assumes that OpenStack credentials are defined using a [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) file in a default location with the default cloud name of `openstack`. diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 2587e8499..929aac465 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -10,6 +10,7 @@ slurm_appliance_secrets: vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}" vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}" + vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}" secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" diff --git a/ansible/roles/passwords/tasks/validate.yml b/ansible/roles/passwords/tasks/validate.yml index 9279ffdbf..b30b0696e 100644 --- a/ansible/roles/passwords/tasks/validate.yml +++ b/ansible/roles/passwords/tasks/validate.yml @@ -1,4 +1,4 @@ - name: Assert secrets created assert: - that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_testuser_password defined in dev + that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_demo_user_password defined in dev fail_msg: "No inventory variables 'vault_*' found: Has ansible/adhoc/generate-passwords.yml been run?" 
diff --git a/docs/openondemand.README.md b/docs/openondemand.md
similarity index 76%
rename from docs/openondemand.README.md
rename to docs/openondemand.md
index 5daba3408..3bd6c9e9f 100644
--- a/docs/openondemand.README.md
+++ b/docs/openondemand.md
@@ -30,11 +30,10 @@ The above functionality is configured by running the `ansible/portal.yml` playbo
 
 See the [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md) for more details on the variables described below.
 
-At minimum the following must be defined:
-- `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). It is suggested to place it groupvars for `all`.
-- `openondemand_auth` and any corresponding options.
-- `openondemand_desktop_partition` and `openondemand_jupyter_partition` if the corresponding inventory groups are defined.
-- `openondemand_host_regex` if `openondemand_desktop` or `openondemand_jupyter` inventory groups are defined and/or proxying Grafana via Open Ondemand is required.
+The following variables have been given default values to allow Open Ondemand to work in a newly created environment without additional configuration, but generally should be overridden in `environments/site/inventory/group_vars/all/` with site-specific values:
+- `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). Default is `ansible_host` (i.e. the IP address) of the first host in the `openondemand` group.
+- `openondemand_auth` and any corresponding options. Defaults to `basic_pam`.
+- `openondemand_desktop_partition` and `openondemand_jupyter_partition` if the corresponding inventory groups are defined. Defaults to the first compute group defined in the `compute` Terraform variable in `environments/$ENV/terraform`.
 
 It is also recommended to set:
 - `openondemand_dashboard_support_url`
@@ -45,3 +44,6 @@ If shared filesystems other than `$HOME` are available, add paths to `openondema
 The appliance automatically configures Open Ondemand to proxy Grafana and adds a link to it on the Open Ondemand dashboard. This means no external IP (or SSH proxying etc) is required to access Grafana (which by default is deployed on the control node). To allow users to authenticate to Grafana, the simplest option is to enable anonymous (View-only) login by setting `grafana_auth_anonymous` (see [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml)[^1]).
 
 [^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open Ondemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments/$ENV/hooks/post.yml`).
+
+# Access
+By default the appliance authenticates against OOD with basic auth through PAM. When creating a new environment, a new user with username `demo_user` will be created. Its password is found under `vault_demo_user_password` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`. Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default).
diff --git a/docs/production.md b/docs/production.md
index c1b139994..5190ecae6 100644
--- a/docs/production.md
+++ b/docs/production.md
@@ -98,6 +98,10 @@
 - Configure Open OnDemand - see [specific documentation](openondemand.README.md).
 
+- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml`
+
+- Consider whether having (read-only) access to Grafana without login is OK. If not, remove `grafana_auth_anonymous` in `environments/$ENV/inventory/group_vars/all/grafana.yml`
+
 - Modify `environments/site/terraform/nodes.tf` to provide fixed IPs for at least
   the control node, and (if not using FIPs) the login node(s):
 
diff --git a/environments/.caas/inventory/group_vars/all/selinux.yml b/environments/.caas/inventory/group_vars/all/selinux.yml
deleted file mode 100644
index 1f1098126..000000000
--- a/environments/.caas/inventory/group_vars/all/selinux.yml
+++ /dev/null
@@ -1 +0,0 @@
-selinux_state: disabled
\ No newline at end of file
diff --git a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml
index ae416cf72..e2088ffd9 100644
--- a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml
+++ b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml
@@ -1,6 +1,6 @@
-test_user_password: "{{ lookup('env', 'TESTUSER_PASSWORD') | default(vault_testuser_password, true) }}" # CI uses env, debug can set vault_testuser_password
+test_demo_user_password: "{{ lookup('env', 'DEMO_USER_PASSWORD') | default(vault_demo_user_password, true) }}" # CI uses env, debug can set vault_demo_user_password
 
 basic_users_users:
-  - name: testuser # can't use rocky as $HOME isn't shared!
-    password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
+  - name: demo_user # can't use rocky as $HOME isn't shared!
+    password: "{{ test_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent
     uid: 1005
diff --git a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml
index 4b3750650..9a979ab16 100644
--- a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml
+++ b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml
@@ -2,8 +2,8 @@
 
 # NB: Users defined this way have expired passwords
 freeipa_users:
-  - name: testuser # can't use rocky as $HOME isn't shared!
-    password: "{{ test_user_password }}"
+  - name: demo_user # can't use rocky as $HOME isn't shared!
+ password: "{{ test_demo_user_password }}" givenname: test sn: test diff --git a/environments/.stackhpc/inventory/group_vars/grafana/overrides.yml b/environments/.stackhpc/inventory/group_vars/all/grafana.yml similarity index 100% rename from environments/.stackhpc/inventory/group_vars/grafana/overrides.yml rename to environments/.stackhpc/inventory/group_vars/all/grafana.yml diff --git a/environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml similarity index 100% rename from environments/.stackhpc/inventory/group_vars/openhpc/overrides.yml rename to environments/.stackhpc/inventory/group_vars/all/openhpc.yml diff --git a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml index 11d475664..72b6cf476 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml @@ -1 +1,8 @@ -openondemand_servername: "{{ hostvars[ groups['openondemand'] | first].ansible_host }}" # Use a SOCKS proxy to acccess +openondemand_auth: basic_pam +openondemand_jupyter_partition: standard +openondemand_desktop_partition: standard +#openondemand_dashboard_support_url: +#openondemand_dashboard_docs_url: +#openondemand_filesapp_paths: +ondemand_package: ondemand-"{{ ondemand_package_version }}" +ondemand_package_version: '3.1.10' diff --git a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml b/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml deleted file mode 100644 index 72b6cf476..000000000 --- a/environments/.stackhpc/inventory/group_vars/openondemand/overrides.yml +++ /dev/null @@ -1,8 +0,0 @@ -openondemand_auth: basic_pam -openondemand_jupyter_partition: standard -openondemand_desktop_partition: standard -#openondemand_dashboard_support_url: -#openondemand_dashboard_docs_url: -#openondemand_filesapp_paths: -ondemand_package: ondemand-"{{ ondemand_package_version }}" -ondemand_package_version: '3.1.10' diff --git a/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml b/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml deleted file mode 100644 index c3b28b913..000000000 --- a/environments/.stackhpc/inventory/group_vars/selinux/overrides.yml +++ /dev/null @@ -1 +0,0 @@ -selinux_state: disabled diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 5e85392ca..cce923fcc 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -5,7 +5,12 @@ # NB: Variables prefixed ood_ are all from https://github.com/OSC/ood-ansible -# openondemand_servername: '' # Must be defined when using openondemand +openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if groups['openondemand'] else '' }}" + +openondemand_auth: basic_pam + +openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" +openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" # Regex defining hosts which openondemand can proxy; the default regex is compute nodes (for apps) and grafana host, # e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'. 
diff --git a/environments/common/inventory/group_vars/all/selinux.yml b/environments/common/inventory/group_vars/all/selinux.yml index 25fbbd68f..fef5c3f58 100644 --- a/environments/common/inventory/group_vars/all/selinux.yml +++ b/environments/common/inventory/group_vars/all/selinux.yml @@ -1,4 +1,4 @@ --- -selinux_state: permissive +selinux_state: disabled selinux_policy: targeted diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 878bebbf3..ad9fa536a 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -36,8 +36,9 @@ login [block_devices:children] # Environment-specific so not defined here -[basic_users] +[basic_users:children] # Add `openhpc` group to add Slurm users via creation of users on each node. +openhpc [openondemand:children] # Host to run Open Ondemand server on - subset of login @@ -51,8 +52,9 @@ compute # Subset of compute to run a Jupyter Notebook servers on via Open Ondemand compute -[etc_hosts] +[etc_hosts:children] # Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md +cluster [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml new file mode 100644 index 000000000..dc993c3b8 --- /dev/null +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml @@ -0,0 +1,4 @@ +basic_users_users: + - name: demo_user + password: "{% raw %}{{ vault_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}{% endraw %}" # idempotent + uid: 1005 diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml new file mode 100644 index 000000000..521616a1b --- /dev/null +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml @@ -0,0 +1 @@ +grafana_auth_anonymous: true \ No newline at end of file diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 0f5eefa18..0a5dde56b 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -6,7 +6,7 @@ variable "cluster_name" { variable "cluster_domain_suffix" { type = string description = "Domain suffix for cluster" - default = "invalid" + default = "internal" } variable "cluster_net" { From 038ddf744a0d4dc9e79b3d84620bff97fbf71b21 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 09:01:22 +0000 Subject: [PATCH 157/182] add delay for ansible-init to finish --- .github/workflows/stackhpc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index ea18a2274..4d0fbb9bb 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -183,6 +183,7 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml + ansible all -m wait_for_connection -a 'delay=60 timeout=600' ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage From 69d9cd859fec84a9a1d337fc4d2c5ec8c47e9c85 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:41:44 +0000 Subject: [PATCH 158/182] Fix epel not using Ark repos for RL8 (#526) * cookiecutter environment now has working defaults * updated docs * refactored ood demo user into cookiecutter * updated docs * changed secret name * Doc changes Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * rename * replaced testuser with demo_user * selinux now defaults to disabled * bump images * updated readme * moved files and removed redundant ood config * environments now have grafana anonymous auth by default * fixed ohpc not using ark repos * bump images --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- .../.stackhpc/inventory/group_vars/all/openhpc.yml | 10 ---------- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/environments/.stackhpc/inventory/group_vars/all/openhpc.yml b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml index 858dfd9d3..5aac5f8ad 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openhpc.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml @@ -1,13 +1,3 @@ openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug - -ohpc_default_extra_repos: - "9": [] #overriding to ensure doesn't overwrite ark epel repo - "8": - - name: epel - file: epel - description: "Extra Packages for Enterprise Linux 8 - $basearch" - metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-8&arch=$basearch&infra=$infra&content=$contentdir" - gpgcheck: true - gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 9c72b07ce..47681ea8a 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250107-1534-b03caaf3", - "RL9": "openhpc-RL9-250107-1535-b03caaf3" + "RL8": "openhpc-RL8-250108-1703-e515b902", + "RL9": "openhpc-RL9-250108-1703-e515b902" } } From 6929272292f0ed7675dfbe961eedf15b3042569d Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:37:26 +0000 Subject: [PATCH 159/182] fix volume_backed_instances not working for compute nodes (#527) --- .../{{cookiecutter.environment}}/terraform/compute.tf | 8 +++++++- .../{{cookiecutter.environment}}/terraform/variables.tf | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 14c728a5a..37c9aad10 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -3,16 +3,22 @@ module "compute" { for_each = var.compute + # must be set for group: nodes = each.value.nodes + flavor = each.value.flavor + cluster_name = var.cluster_name 
cluster_domain_suffix = var.cluster_domain_suffix cluster_net_id = data.openstack_networking_network_v2.cluster_net.id cluster_subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id - flavor = each.value.flavor + # can be set for group, defaults to top-level value: image_id = lookup(each.value, "image_id", var.cluster_image_id) vnic_type = lookup(each.value, "vnic_type", var.vnic_type) vnic_profile = lookup(each.value, "vnic_profile", var.vnic_profile) + volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index 0a5dde56b..f2cfe1215 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -52,6 +52,8 @@ variable "compute" { image_id: Overrides variable cluster_image_id vnic_type: Overrides variable vnic_type vnic_profile: Overrides variable vnic_profile + volume_backed_instances: Overrides variable volume_backed_instances + root_volume_size: Overrides variable root_volume_size EOF } From 4652c34c4fb365cc8a1d2628cf6299957017efc8 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:48:09 +0000 Subject: [PATCH 160/182] typo --- ansible/roles/persist_hostkeys/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index 8bb2d6306..716b09146 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -21,7 +21,7 @@ shell: cmd: | mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh - ssh-keygen -A -N \"\" -f {{ persist_hostkeys_state_dir }} + ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh From f021167e7970e23ceef94a0215b0c773a92edda7 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:49:11 +0000 Subject: [PATCH 161/182] comment update Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/roles/persist_hostkeys/tasks/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index 716b09146..deff112f7 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -19,6 +19,7 @@ - name: Generate hostkeys when: _files_found.matched == 0 shell: + # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into cmd: | mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} From 7057c5090cd918c99d6339ba60a71eede8e5a004 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 12:04:29 +0000 Subject: [PATCH 162/182] remove delay in compute node rebuild ci --- .github/workflows/stackhpc.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index b2651af2f..d5bd313ca 
100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -183,7 +183,6 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml - ansible all -m wait_for_connection -a 'delay=60 timeout=600' ansible-playbook -v ansible/ci/check_slurm.yml - name: Check sacct state survived reimage From 3faa81382941174657f0b2a8c9cf35f135c9debc Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 14:25:50 +0000 Subject: [PATCH 163/182] fix compute init metadata flags --- ansible/roles/compute_init/files/compute-init.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index c7a9048b4..430e2cf65 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -6,13 +6,13 @@ vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" server_node_ip: "{{ os_metadata.meta.control_address }}" - enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" - enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" - enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" - enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" - enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" - enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}" - enable_eessi: "{{ os_metadata.meta.enable_eessi | default(false) | bool }}" + enable_compute: "{{ os_metadata.meta.compute | default(false) | bool }}" + enable_resolv_conf: "{{ os_metadata.meta.resolv_conf | default(false) | bool }}" + enable_etc_hosts: "{{ os_metadata.meta.etc_hosts | default(false) | bool }}" + enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}" + enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}" + enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}" + enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}" # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] From a7876a665d2ac6ea5450eec3c4a971b59889a2b7 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:27:08 +0000 Subject: [PATCH 164/182] Support additional volumes on compute nodes (#528) --- .../terraform/compute.tf | 1 + .../terraform/compute/nodes.tf | 30 +++++++++++++++++++ .../terraform/compute/variables.tf | 12 ++++++++ .../terraform/variables.tf | 5 ++++ 4 files changed, 48 insertions(+) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 37c9aad10..ba9da127c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -18,6 +18,7 @@ module "compute" { vnic_profile = lookup(each.value, "vnic_profile", var.vnic_profile) volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + extra_volumes = lookup(each.value, "extra_volumes", {}) key_pair = var.key_pair 
environment_root = var.environment_root diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index 7a2a706a6..ab869e28e 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -1,3 +1,33 @@ +locals { + all_compute_volumes = {for v in setproduct(var.nodes, keys(var.extra_volumes)): "${v[0]}-${v[1]}" => {"node" = v[0], "volume" = v[1]}} + # e.g. with + # var.nodes = ["compute-0", "compute-1"] + # var.extra_volumes = { + # "vol-a" = {size = 10}, + # "vol-b" = {size = 20} + # } + # this is a mapping with + # keys "compute-0-vol-a", "compute-0-vol-b" ... + # values which are a mapping e.g. {"node"="compute-0", "volume"="vol-a"} +} + +resource "openstack_blockstorage_volume_v3" "compute" { + + for_each = local.all_compute_volumes + + name = "${var.cluster_name}-${each.key}" + description = "Compute node ${each.value.node} volume ${each.value.volume}" + size = var.extra_volumes[each.value.volume].size +} + +resource "openstack_compute_volume_attach_v2" "compute" { + + for_each = local.all_compute_volumes + + instance_id = openstack_compute_instance_v2.compute["${each.value.node}"].id + volume_id = openstack_blockstorage_volume_v3.compute["${each.key}"].id +} + resource "openstack_networking_port_v2" "compute" { for_each = toset(var.nodes) diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 3655c9e65..72bcf08fd 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -64,6 +64,18 @@ variable "root_volume_size" { default = 40 } +variable "extra_volumes" { + description = <<-EOF + Mapping defining additional volumes to create and attach. + Keys are unique volume name. + Values are a mapping with: + size: Size of volume in GB + **NB**: The order in /dev is not guaranteed to match the mapping + EOF + type = any + default = {} +} + variable "security_group_ids" { type = list } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index f2cfe1215..4d8058208 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -54,6 +54,11 @@ variable "compute" { vnic_profile: Overrides variable vnic_profile volume_backed_instances: Overrides variable volume_backed_instances root_volume_size: Overrides variable root_volume_size + extra_volumes: Mapping defining additional volumes to create and attach + Keys are unique volume name. 
+ Values are a mapping with: + size: Size of volume in GB + **NB**: The order in /dev is not guaranteed to match the mapping EOF } From bc16dbaa25da04d4a350413d343a18fbcb0f7e68 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 9 Jan 2025 15:14:02 +0000 Subject: [PATCH 165/182] bump image --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 47681ea8a..cb4b4e32e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250108-1703-e515b902", - "RL9": "openhpc-RL9-250108-1703-e515b902" + "RL8": "openhpc-RL8-250109-1431-3faa8138", + "RL9": "openhpc-RL9-250109-1431-3faa8138" } } From 2903223f34394c0f0d58190206d21f7c6ca08e18 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 9 Jan 2025 20:26:21 +0000 Subject: [PATCH 166/182] Support SSSD and optionally LDAP (#438) * support sssd configuration * make sssd-ldap optional * SSSD PR review tweaks * enable installing sssd in fatimage * install sssd and sssd-ldap packages in stackhpc fatimage * fix sssd being enabled in fatimage * bump CI image * simplify sssd-ldap package installation in fatimage * bump CI image * enable mkhomedir * add sshd role * auto enable ssh passwords if using ldap * actually run sshd role * make sshd config more flexible * add basic_users_override_sssd flag * port PR comment re. basic_users docs * add sssd-ldap package during stackhpc build only * bump CI image * add missing empty sssd group * remove deprecated & empty block_devices group * regularise common groups & everything groups template a bit * bumb CI image * sssd review comments Co-authored-by: Will Szumski --------- Co-authored-by: Will Szumski --- ansible/.gitignore | 4 +++ ansible/bootstrap.yml | 9 ++++++ ansible/fatimage.yml | 5 ++++ ansible/iam.yml | 9 ++++++ ansible/roles/basic_users/README.md | 1 + ansible/roles/basic_users/defaults/main.yml | 1 + ansible/roles/basic_users/tasks/main.yml | 17 ++++++++++- ansible/roles/sshd/README.md | 9 ++++++ ansible/roles/sshd/defaults/main.yml | 3 ++ ansible/roles/sshd/handlers/main.yml | 4 +++ ansible/roles/sshd/tasks/configure.yml | 15 ++++++++++ ansible/roles/sshd/tasks/main.yml | 1 + ansible/roles/sshd/templates/sshd.conf.j2 | 2 ++ ansible/roles/sssd/README.md | 18 ++++++++++++ ansible/roles/sssd/defaults/main.yml | 12 ++++++++ ansible/roles/sssd/handlers/main.yml | 5 ++++ ansible/roles/sssd/tasks/configure.yml | 28 +++++++++++++++++++ ansible/roles/sssd/tasks/install.yml | 13 +++++++++ ansible/roles/sssd/tasks/main.yml | 2 ++ environments/.stackhpc/inventory/extra_groups | 4 +++ .../inventory/group_vars/builder.yml | 2 ++ .../terraform/cluster_image.auto.tfvars.json | 4 +-- .../common/inventory/group_vars/all/sshd.yaml | 1 + .../inventory/group_vars/builder/defaults.yml | 2 ++ environments/common/inventory/groups | 18 ++++++++---- environments/common/layouts/everything | 9 +++++- 26 files changed, 188 insertions(+), 10 deletions(-) create mode 100644 ansible/roles/sshd/README.md create mode 100644 ansible/roles/sshd/defaults/main.yml create mode 100644 ansible/roles/sshd/handlers/main.yml create mode 100644 ansible/roles/sshd/tasks/configure.yml create mode 100644 ansible/roles/sshd/tasks/main.yml create mode 100644 
ansible/roles/sshd/templates/sshd.conf.j2 create mode 100644 ansible/roles/sssd/README.md create mode 100644 ansible/roles/sssd/defaults/main.yml create mode 100644 ansible/roles/sssd/handlers/main.yml create mode 100644 ansible/roles/sssd/tasks/configure.yml create mode 100644 ansible/roles/sssd/tasks/install.yml create mode 100644 ansible/roles/sssd/tasks/main.yml create mode 100644 environments/common/inventory/group_vars/all/sshd.yaml diff --git a/ansible/.gitignore b/ansible/.gitignore index a7197ff4c..1cabb8ad8 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -58,6 +58,10 @@ roles/* !roles/squid/** !roles/tuned/ !roles/tuned/** +!roles/sssd/ +!roles/sssd/** +!roles/sshd/ +!roles/sshd/** !roles/compute_init/ !roles/compute_init/** !roles/k3s/ diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index e2497d9c6..88d9274b3 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -110,6 +110,15 @@ policy: "{{ selinux_policy }}" register: sestatus +- hosts: sshd + tags: sshd + gather_facts: no + become: yes + tasks: + - name: Configure sshd + import_role: + name: sshd + - hosts: dnf_repos become: yes tasks: diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 9a8828a35..e5de38edf 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -54,6 +54,11 @@ name: freeipa tasks_from: client-install.yml when: "'freeipa_client' in group_names" + - name: Install sssd + import_role: + name: sssd + tasks_from: install.yml + when: "'sssd' in group_names" # - import_playbook: filesystems.yml: - name: Install nfs packages diff --git a/ansible/iam.yml b/ansible/iam.yml index 0286b9df3..857b8f840 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -40,3 +40,12 @@ import_role: name: freeipa tasks_from: users.yml + +- hosts: sssd + become: yes + gather_facts: no + tags: sssd + tasks: + - name: Configure sssd + import_role: + name: sssd diff --git a/ansible/roles/basic_users/README.md b/ansible/roles/basic_users/README.md index 4b75100ca..65fdd2c4c 100644 --- a/ansible/roles/basic_users/README.md +++ b/ansible/roles/basic_users/README.md @@ -24,6 +24,7 @@ Role Variables - An additional key `sudo` may optionally be specified giving a string (possibly multiline) defining sudo rules to be templated. - Any other keys may present for other purposes (i.e. not used by this role). - `basic_users_groups`: Optional, default empty list. A list of mappings defining information for each group. Mapping keys/values are passed through as parameters to [ansible.builtin.group](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/group_module.html) and default values are as given there. +- `basic_users_override_sssd`: Optional bool, default false. Whether to disable `sssd` when ensuring users/groups exist with this role. Permits creating local users/groups even if they clash with users provided via sssd (e.g. from LDAP). Ignored if host is not in group `sssd` as well. Note with this option active `sssd` will be stopped and restarted each time this role is run. 
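+
+For example, a sketch of group vars creating a local user on hosts which are
+also in the `sssd` group (the username and uid are illustrative only):
+
+    basic_users_override_sssd: true
+    basic_users_users:
+      - name: localuser # illustrative
+        uid: 1010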
 Dependencies
 ------------
diff --git a/ansible/roles/basic_users/defaults/main.yml b/ansible/roles/basic_users/defaults/main.yml
index 9f34bdf4c..e6c6eafaa 100644
--- a/ansible/roles/basic_users/defaults/main.yml
+++ b/ansible/roles/basic_users/defaults/main.yml
@@ -7,3 +7,4 @@ basic_users_userdefaults:
   shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}"
 basic_users_users: []
 basic_users_groups: []
+basic_users_override_sssd: false
diff --git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml
index c27d024b4..c6733fb89 100644
--- a/ansible/roles/basic_users/tasks/main.yml
+++ b/ansible/roles/basic_users/tasks/main.yml
@@ -7,7 +7,16 @@
     label: "{{ item.name }}"
   when:
     - "item.state | default('present') == 'absent'"
-
+
+- name: Stop sssd if required
+  systemd:
+    name: sssd
+    state: stopped
+  register: _stop_sssd
+  when:
+    - "'sssd' in group_names"
+    - basic_users_override_sssd | bool
+
 - name: Create groups
   ansible.builtin.group: "{{ item }}"
   loop: "{{ basic_users_groups }}"
@@ -19,6 +28,12 @@
     label: "{{ item.name }} [{{ item.state | default('present') }}]"
   register: basic_users_info
 
+- name: Restart sssd if required
+  systemd:
+    name: sssd
+    state: started
+  when: _stop_sssd is changed
+
 - name: Write supplied public key as authorized for SSH access
   authorized_key:
     user: "{{ item.name }}"
diff --git a/ansible/roles/sshd/README.md b/ansible/roles/sshd/README.md
new file mode 100644
index 000000000..0fac1d189
--- /dev/null
+++ b/ansible/roles/sshd/README.md
@@ -0,0 +1,9 @@
+# sshd
+
+Configure sshd.
+
+## Role variables
+
+- `sshd_password_authentication`: Optional bool. Whether to enable password login. Default `false`.
+- `sshd_conf_src`: Optional string. Path to sshd configuration template. Default is in-role template.
+- `sshd_conf_dest`: Optional string. Path to destination for sshd configuration file. Default is `/etc/ssh/sshd_config.d/10-ansible.conf` which overrides `50-{cloud-init,redhat}` files, if present.
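+
+For example, a minimal group vars sketch enabling password logins (e.g. for
+users provided via LDAP-backed `sssd`):
+
+    sshd_password_authentication: true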
diff --git a/ansible/roles/sshd/defaults/main.yml b/ansible/roles/sshd/defaults/main.yml new file mode 100644 index 000000000..672305799 --- /dev/null +++ b/ansible/roles/sshd/defaults/main.yml @@ -0,0 +1,3 @@ +sshd_password_authentication: false +sshd_conf_src: sshd.conf.j2 +sshd_conf_dest: /etc/ssh/sshd_config.d/10-ansible.conf diff --git a/ansible/roles/sshd/handlers/main.yml b/ansible/roles/sshd/handlers/main.yml new file mode 100644 index 000000000..e11aa7801 --- /dev/null +++ b/ansible/roles/sshd/handlers/main.yml @@ -0,0 +1,4 @@ +- name: Restart sshd + systemd: + name: sshd + state: restarted diff --git a/ansible/roles/sshd/tasks/configure.yml b/ansible/roles/sshd/tasks/configure.yml new file mode 100644 index 000000000..8aafb5c19 --- /dev/null +++ b/ansible/roles/sshd/tasks/configure.yml @@ -0,0 +1,15 @@ +- name: Template sshd configuration + # NB: If parameters are defined multiple times the first value wins; + # The default /etc/ssh/sshd_config has + # Include /etc/ssh/sshd_config.d/*.conf + # early on, which is generally held to be the correct approach, so adding + # values to the end of that file won't work + template: + src: "{{ sshd_conf_src }}" + dest: "{{ sshd_conf_dest }}" + owner: root + group: root + mode: u=rw,go= + validate: sshd -t -f %s + notify: + - Restart sshd diff --git a/ansible/roles/sshd/tasks/main.yml b/ansible/roles/sshd/tasks/main.yml new file mode 100644 index 000000000..84f493457 --- /dev/null +++ b/ansible/roles/sshd/tasks/main.yml @@ -0,0 +1 @@ +- import_tasks: configure.yml diff --git a/ansible/roles/sshd/templates/sshd.conf.j2 b/ansible/roles/sshd/templates/sshd.conf.j2 new file mode 100644 index 000000000..2746f0642 --- /dev/null +++ b/ansible/roles/sshd/templates/sshd.conf.j2 @@ -0,0 +1,2 @@ +# {{ ansible_managed }} +PasswordAuthentication {{ 'yes' if sshd_password_authentication | bool else 'no' }} diff --git a/ansible/roles/sssd/README.md b/ansible/roles/sssd/README.md new file mode 100644 index 000000000..da4e63f31 --- /dev/null +++ b/ansible/roles/sssd/README.md @@ -0,0 +1,18 @@ +# sssd + +Install and configure [sssd](https://sssd.io/docs/introduction.html). + + +## Role variables + +The only required configuration is to create a [sssd.conf](https://www.mankier.com/5/sssd.conf) template at the location specified by `sssd_conf_src`. + +- `sssd_packages`: Optional list. Packages to install. +- `sssd_install_ldap`: Optional bool. Whether to install packages enabling SSSD to authenticate against LDAP. Default `false`. +- `sssd_ldap_packages`: Optional list. Packages to install when using `sssd_install_ldap`. +- `sssd_enable_mkhomedir`: Optional bool. Whether to enable creation of home directories on login. Default `false`. +- `sssd_mkhomedir_packages`: Optional list. Packages to install when using `sssd_enable_mkhomedir`. +- `sssd_conf_src`: Optional string. Path to `sssd.conf` template. Default (which must be created) is `{{ appliances_environment_root }}/files/sssd.conf.j2`. +- `sssd_conf_dest`: Optional string. Path to destination for `sssd.conf`. Default `/etc/sssd/sssd.conf`. +- `sssd_started`: Optional bool. Whether `sssd` service should be started. Default `true`. +- `sssd_enabled`: Optional bool. Whether `sssd` service should be enabled. Default `true`. 
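As no template ships with the role, a minimal `sssd.conf.j2` must be provided per-environment; for an LDAP domain this might look like the following sketch (the URI and search base are placeholders, not appliance defaults):

```ini
# environments/$SITE_ENV/files/sssd.conf.j2 (sketch)
[sssd]
services = nss, pam
domains = ldap

[domain/ldap]
id_provider = ldap
auth_provider = ldap
ldap_uri = ldaps://ldap.example.org
ldap_search_base = dc=example,dc=org
cache_credentials = true
```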
diff --git a/ansible/roles/sssd/defaults/main.yml b/ansible/roles/sssd/defaults/main.yml new file mode 100644 index 000000000..5bc58c990 --- /dev/null +++ b/ansible/roles/sssd/defaults/main.yml @@ -0,0 +1,12 @@ +sssd_packages: + - sssd-common +sssd_install_ldap: false +sssd_ldap_packages: + - sssd-ldap +sssd_enable_mkhomedir: false +sssd_mkhomedir_packages: + - oddjob-mkhomedir +sssd_conf_src: "{{ appliances_environment_root }}/files/sssd.conf.j2" +sssd_conf_dest: /etc/sssd/sssd.conf +sssd_started: true +sssd_enabled: true diff --git a/ansible/roles/sssd/handlers/main.yml b/ansible/roles/sssd/handlers/main.yml new file mode 100644 index 000000000..72c36e736 --- /dev/null +++ b/ansible/roles/sssd/handlers/main.yml @@ -0,0 +1,5 @@ +- name: Restart sssd + systemd: + name: sssd + state: restarted + when: sssd_started | bool diff --git a/ansible/roles/sssd/tasks/configure.yml b/ansible/roles/sssd/tasks/configure.yml new file mode 100644 index 000000000..ae636e9dd --- /dev/null +++ b/ansible/roles/sssd/tasks/configure.yml @@ -0,0 +1,28 @@ +- name: Manage sssd.conf configuration + template: + src: "{{ sssd_conf_src }}" + dest: "{{ sssd_conf_dest }}" + owner: root + group: root + mode: u=rw,go= + notify: "Restart sssd" + +- meta: flush_handlers + +- name: Ensure sssd service state + systemd: + name: sssd + state: "{{ 'started' if sssd_started | bool else 'stopped' }}" + enabled: "{{ sssd_enabled | bool }}" + +- name: Get current authselect configuration + command: authselect current --raw + changed_when: false + failed_when: + - _authselect_current.rc != 0 + - "'No existing configuration detected' not in _authselect_current.stdout" + register: _authselect_current # stdout: sssd with-mkhomedir + +- name: Configure nsswitch and PAM for SSSD + command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}" + when: "'sssd' not in _authselect_current.stdout" diff --git a/ansible/roles/sssd/tasks/install.yml b/ansible/roles/sssd/tasks/install.yml new file mode 100644 index 000000000..97aa82a2f --- /dev/null +++ b/ansible/roles/sssd/tasks/install.yml @@ -0,0 +1,13 @@ +- name: Ensure sssd packages are installed + dnf: + name: "{{ sssd_packages + (sssd_ldap_packages if sssd_install_ldap | bool else []) }}" + +- name: Control if sssd should start on boot + # Needs to be done here to prevent starting after image build, is enabled by default + systemd: + name: sssd + enabled: "{{ sssd_enabled | bool }}" + +- name: Ensure mkhomedir packages are installed if required + dnf: + name: "{{ sssd_mkhomedir_packages }}" diff --git a/ansible/roles/sssd/tasks/main.yml b/ansible/roles/sssd/tasks/main.yml new file mode 100644 index 000000000..2b65e84b4 --- /dev/null +++ b/ansible/roles/sssd/tasks/main.yml @@ -0,0 +1,2 @@ +- import_tasks: install.yml +- import_tasks: configure.yml diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 7c9a7c774..2531b803e 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -31,3 +31,7 @@ compute [squid:children] # Install squid into fat image builder + +[sssd:children] +# Install sssd into fat image +builder diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 5130e9d84..10b15adac 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,3 +1,5 @@ +#update_enable: false # Can 
uncomment for speed debugging non-update related build issues +sssd_install_ldap: true # include sssd-ldap package in fatimage # update_enable: false # Can uncomment for speed debugging non-update related build issues # Uncomment below to use CI pulp servers diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 47681ea8a..3c1e19058 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250108-1703-e515b902", - "RL9": "openhpc-RL9-250108-1703-e515b902" + "RL8": "openhpc-RL8-250109-1444-ecea8219", + "RL9": "openhpc-RL9-250109-1444-ecea8219" } } diff --git a/environments/common/inventory/group_vars/all/sshd.yaml b/environments/common/inventory/group_vars/all/sshd.yaml new file mode 100644 index 000000000..5d4ed228f --- /dev/null +++ b/environments/common/inventory/group_vars/all/sshd.yaml @@ -0,0 +1 @@ +sshd_password_authentication: "{{ sssd_install_ldap | default(false) | bool }}" diff --git a/environments/common/inventory/group_vars/builder/defaults.yml b/environments/common/inventory/group_vars/builder/defaults.yml index b43d9f03c..dae4edd9a 100644 --- a/environments/common/inventory/group_vars/builder/defaults.yml +++ b/environments/common/inventory/group_vars/builder/defaults.yml @@ -22,4 +22,6 @@ squid_cache_disk: 0 # just needs to be defined squid_cache_mem: 0 tuned_started: false tuned_enabled: false +sssd_started: false +sssd_enabled: false appliances_mode: build diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index cbc69d800..1d756ed66 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -13,9 +13,6 @@ login control compute -[eessi:children] -# Hosts on which EESSI stack should be configured - [hpctests:children] # Login group to use for running mpi-based testing. login @@ -79,9 +76,6 @@ cluster # Hosts to install firewalld on - see ansible/roles/firewalld fail2ban -[block_devices] -# Superset of hosts to configure filesystems on - see ansible/roles/block_devices/README.md - [basic_users] # Add `openhpc` group to add slurm users via creation of users on each node. @@ -118,12 +112,18 @@ freeipa_client [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md +[eessi] +# Hosts on which EESSI stack should be configured + [resolv_conf] # Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md [proxy] # Hosts to configure http/s proxies - see ansible/roles/proxy/README.md +[manila] +# Hosts to configure for manila fileshares + [persist_hostkeys] # Hosts to persist hostkeys for across reimaging. NB: Requires appliances_state_dir on hosts. 
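Mirroring the `.stackhpc` extra_groups pattern above, a site would typically opt hosts into these new roles via its own inventory, e.g. this sketch:

```ini
# environments/$SITE_ENV/inventory/extra_groups (sketch)
[sssd:children]
cluster

[sshd:children]
cluster
```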
@@ -136,6 +136,12 @@ freeipa_client [ansible_init] # Hosts to run linux-ansible-init +[sssd] +# Hosts to configure sssd on + +[sshd] +# Hosts where the OpenSSH server daemon should be configured + [compute_init] # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index d3b8fe040..4293cbca0 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -60,6 +60,7 @@ cluster # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md [eessi:children] +# Hosts on which EESSI stack should be configured openhpc [resolv_conf] @@ -83,9 +84,15 @@ openondemand # Hosts to run TuneD configuration [ansible_init:children] -# Hosts to run ansible-init +# Hosts to run linux-ansible-init cluster +[sssd] +# Hosts to configure sssd on + +[sshd] +# Hosts where the OpenSSH server daemon should be configured + [compute_init:children] # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on compute From d2e18d0c5346509abc7546bdd70fc74a5ca87e5e Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 10 Jan 2025 09:16:10 +0000 Subject: [PATCH 167/182] bump image --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 3c1e19058..3c43e02eb 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250109-1444-ecea8219", - "RL9": "openhpc-RL9-250109-1444-ecea8219" + "RL8": "openhpc-RL8-250109-2102-5193ba2f", + "RL9": "openhpc-RL9-250110-0016-5193ba2f" } } From 3b09bd144361dcd4243cb89dbebf48fc68e4ba68 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Fri, 10 Jan 2025 16:50:47 +0100 Subject: [PATCH 168/182] Fix various typos in documentation --- README.md | 14 +++++++------- docs/image-build.md | 2 +- docs/monitoring-and-logging.md | 2 +- docs/openondemand.md | 24 ++++++++++++------------ docs/production.md | 2 +- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 593837ccd..dd6451011 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,10 @@ This repository contains playbooks and configuration to define a Slurm-based HPC - [Rocky Linux](https://rockylinux.org/)-based hosts. - [OpenTofu](https://opentofu.org/) configurations to define the cluster's infrastructure-as-code. - Packages for Slurm and MPI software stacks from [OpenHPC](https://openhpc.community/). -- Shared fileystem(s) using NFS (with in-cluster or external servers) or [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [Openstack Manila](https://wiki.openstack.org/wiki/Manila). +- Shared filesystem(s) using NFS (with in-cluster or external servers) or [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [OpenStack Manila](https://wiki.openstack.org/wiki/Manila). - Slurm accounting using a MySQL database. - Monitoring integrated with Slurm jobs using Prometheus, ElasticSearch and Grafana. -- A web-based portal from [OpenOndemand](https://openondemand.org/). +- A web-based portal from [Open OnDemand](https://openondemand.org/). - Production-ready default Slurm configurations for access and memory limits. - [Packer](https://developer.hashicorp.com/packer)-based image build configurations for node images. 
@@ -25,7 +25,7 @@ The default configuration in this repository may be used to create a cluster to - Persistent state backed by an OpenStack volume. - NFS-based shared file system backed by another OpenStack volume. -Note that the OpenOndemand portal and its remote apps are not usable with this default configuration. +Note that the Open OnDemand portal and its remote apps are not usable with this default configuration. It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud. @@ -33,7 +33,7 @@ Before starting ensure that: - You have root access on the deploy host. - You can create instances using a Rocky 9 GenericCloud image (or an image based on that). - **NB**: In general it is recommended to use the [latest released image](https://github.com/stackhpc/ansible-slurm-appliance/releases) which already contains the required packages. This is built and tested in StackHPC's CI. -- You have a SSH keypair defined in OpenStack, with the private key available on the deploy host. +- You have an SSH keypair defined in OpenStack, with the private key available on the deploy host. - Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). @@ -66,7 +66,7 @@ Use the `cookiecutter` template to create a new environment to hold your configu and follow the prompts to complete the environment name and description. -**NB:** In subsequent sections this new environment is refered to as `$ENV`. +**NB:** In subsequent sections this new environment is referred to as `$ENV`. Activate the new environment: @@ -124,8 +124,8 @@ where the IP of the login node is given in `environments/$ENV/inventory/hosts.ym ## Overview of directory structure - `environments/`: See [docs/environments.md](docs/environments.md). -- `ansible/`: Contains the ansible playbooks to configure the infrastruture. -- `packer/`: Contains automation to use Packer to build machine images for an enviromment - see the README in this directory for further information. +- `ansible/`: Contains the ansible playbooks to configure the infrastructure. +- `packer/`: Contains automation to use Packer to build machine images for an environment - see the README in this directory for further information. - `dev/`: Contains development tools. For further information see the [docs](docs/) directory. diff --git a/docs/image-build.md b/docs/image-build.md index db51265a3..dc968ebfd 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -51,7 +51,7 @@ To build either a site-specific fat image from scratch, or to extend an existing openstack image unset --property signature_verified $SOURCE_IMAGE - then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). + then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). 6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. 
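The recovery sequence described above amounts to something like the following sketch; the error-volume query is an assumption, so identify the failed volume however is convenient:

```bash
openstack image unset --property signature_verified $SOURCE_IMAGE
# delete volume(s) left behind by the failed build (assumes they are in error state)
openstack volume list --status error -f value -c ID | xargs -r openstack volume delete
```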
diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index 3e3de38c0..db228d410 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -96,7 +96,7 @@ The `grafana` group controls the placement of the grafana service. Load balancin ### Access -If Open Ondemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `3000`. +If Open OnDemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `3000`. The default credentials for the admin user are: diff --git a/docs/openondemand.md b/docs/openondemand.md index 3bd6c9e9f..6b501d20b 100644 --- a/docs/openondemand.md +++ b/docs/openondemand.md @@ -1,28 +1,28 @@ # Overview -The appliance can deploy the Open Ondemand portal. This page describes how to enable this and the default appliance configuration/behaviour. Note that detailed configuration documentation is provided by: +The appliance can deploy the Open OnDemand portal. This page describes how to enable this and the default appliance configuration/behaviour. Note that detailed configuration documentation is provided by: - The README for the included `openondemand` role in this repo - [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md). - The README and default variables for the underlying "official" role which the above wraps - [Open OnDemand Ansible Role](https://github.com/OSC/ood-ansible) -- The documentation for Open Ondemand [itself](https://osc.github.io/ood-documentation/latest/index.html) +- The documentation for Open OnDemand [itself](https://osc.github.io/ood-documentation/latest/index.html) This appliance can deploy and configure: -- The Open Ondemand server itself (usually on a single login node). +- The Open OnDemand server itself (usually on a single login node). - User authentication using one of: - An external OIDC provider. - HTTP basic authentication and PAM. - Virtual desktops on compute nodes. - Jupyter notebook servers on compute nodes. -- Proxying of Grafana (usually deployed on the control node) via the Open Ondemand portal. -- Links to additional filesystems and pages from the Open Ondemand Dashboard. -- A Prometheus exporter for the Open Ondemand server and related Grafana dashboard +- Proxying of Grafana (usually deployed on the control node) via the Open OnDemand portal. +- Links to additional filesystems and pages from the Open OnDemand Dashboard. +- A Prometheus exporter for the Open OnDemand server and related Grafana dashboard For examples of all of the above see the `smslabs-example` environment in this repo. -# Enabling Open Ondemand -To enable the Open Ondemand server, add single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open Ondemand must be able to access Slurm commands. +# Enabling Open OnDemand +To enable the Open OnDemand server, add a single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open OnDemand must be able to access Slurm commands. 
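For example, a sketch (the hostname is illustrative):

```ini
# environments/$ENV/inventory/groups
[openondemand]
mycluster-login-0
```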
-To enable compute nodes for virtual desktops or Jupyter notebook servers (accessed through the Open Ondemand portal), add nodes/groups to the `openondemand_desktop` and `openondemand_jupyter` inventory groups respectively. These may be all or a subset of the `compute` group. +To enable compute nodes for virtual desktops or Jupyter notebook servers (accessed through the Open OnDemand portal), add nodes/groups to the `openondemand_desktop` and `openondemand_jupyter` inventory groups respectively. These may be all or a subset of the `compute` group. The above functionality is configured by running the `ansible/portal.yml` playbook. This is automatically run as part of `ansible/site.yml`. @@ -30,7 +30,7 @@ See the [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md) for more details on the variables described below. -The following variables have been given default values to allow Open Ondemand to work in a newly created environment without additional configuration, but generally should be overridden in `environment/site/inventory/group_vars/all/` with site-specific values: +The following variables have been given default values to allow Open OnDemand to work in a newly created environment without additional configuration, but generally should be overridden in `environments/site/inventory/group_vars/all/` with site-specific values: - `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). Default is `ansible_host` (i.e. the IP address) of the first host in the `openondemand` group. - `openondemand_auth` and any corresponding options. Defaults to `basic_pam`. - `openondemand_desktop_partition` and `openondemand_jupyter_partition` if the corresponding inventory groups are defined. Defaults to the first compute group defined in the `compute` Terraform variable in `environments/$ENV/terraform`. @@ -41,9 +41,9 @@ It is also recommended to set: If shared filesystems other than `$HOME` are available, add paths to `openondemand_filesapp_paths`. -The appliance automatically configures Open Ondemand to proxy Grafana and adds a link to it on the Open Ondemand dashboard. This means no external IP (or SSH proxying etc) is required to access Grafana (which by default is deployed on the control node). To allow users to authenticate to Grafana, the simplest option is to enable anonymous (View-only) login by setting `grafana_auth_anonymous` (see [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml)[^1]). +The appliance automatically configures Open OnDemand to proxy Grafana and adds a link to it on the Open OnDemand dashboard. This means no external IP (or SSH proxying etc) is required to access Grafana (which by default is deployed on the control node). To allow users to authenticate to Grafana, the simplest option is to enable anonymous (View-only) login by setting `grafana_auth_anonymous` (see [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml)[^1]). -[^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open Ondemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. 
This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`). +[^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open OnDemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`). # Access By default the appliance authenticates against OOD with basic auth through PAM. When creating a new environment, a new user with username `demo_user` will be created. Its password is found under `vault_openondemand_default_user` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`. Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default). diff --git a/docs/production.md b/docs/production.md index 9f1b3f7bf..59b9f3775 100644 --- a/docs/production.md +++ b/docs/production.md @@ -96,7 +96,7 @@ and referenced from the `site` and `production` environments, e.g.: cluster ``` -- Configure Open OpenOndemand - see [specific documentation](openondemand.README.md). +- Configure Open OnDemand - see [specific documentation](openondemand.README.md). 
- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml` From 438ed3ad6f40916e4256070846724f298f8c274d Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 10 Jan 2025 17:37:44 +0000 Subject: [PATCH 169/182] adjust check_slurm logic to deal with idle* state --- ansible/ci/check_slurm.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index d95c5bb5c..6507caf08 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -6,9 +6,9 @@ shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name register: sinfo changed_when: false - until: "'boot' not in sinfo.stdout_lines" - retries: 5 - delay: 10 + until: not ("boot" in sinfo.stdout or "idle*" in sinfo.stdout) + retries: 10 + delay: 5 - name: Check nodes have expected slurm state assert: that: sinfo.stdout_lines == expected_sinfo From 37c1dcebd4489f88d6e60de10ea89cac1caec26b Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 13 Jan 2025 09:29:47 +0000 Subject: [PATCH 170/182] Fix nightly cleanup to deal with duplicate server names --- .github/workflows/nightly-cleanup.yml | 40 +++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index f76bd51a9..577a20775 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -63,15 +63,43 @@ jobs: echo "No clusters to delete." exit 0 fi - + for cluster_prefix in ${ci_clusters} do echo "Processing cluster: $cluster_prefix" - TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) - if [[ $TAGS =~ "keep" ]]; then - echo "Skipping ${cluster_prefix} - control instance is tagged as keep" - else - ./dev/delete-cluster.py ${cluster_prefix} --force + + # Retrieve all servers matching the cluster prefix + SERVERS=$(openstack server list --name "${cluster_prefix}-.*" -f value -c ID -c Name) + + if [[ -z "$SERVERS" ]]; then + echo "No servers found for cluster ${cluster_prefix}" + continue + fi + + KEEP_FLAG=false + while IFS= read -r line; do + SERVER_ID=$(echo "$line" | awk '{print $1}') + SERVER_NAME=$(echo "$line" | awk '{print $2}') + + # Check tags only on control nodes + if [[ "$SERVER_NAME" == "${cluster_prefix}-control" ]]; then + TAGS=$(openstack server show $SERVER_ID --column tags --format value) + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping cluster ${cluster_prefix} - control instance is tagged as keep" + KEEP_FLAG=true + break + fi + fi + done <<< "$SERVERS" + + # Delete all servers if control node is not tagged with keep + if [[ "$KEEP_FLAG" == false ]]; then + echo "Deleting all servers in cluster ${cluster_prefix}" + while IFS= read -r line; do + SERVER_ID=$(echo "$line" | awk '{print $1}') + echo "Deleting server $SERVER_ID" + openstack server delete $SERVER_ID || true + done <<< "$SERVERS" fi done shell: bash From 9b1bf122847f8345ac70e764fd81300829de73d0 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:03:32 +0000 Subject: [PATCH 171/182] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 40 ++++----------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml 
b/.github/workflows/nightly-cleanup.yml index 577a20775..8ea3ca74d 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -63,43 +63,15 @@ jobs: echo "No clusters to delete." exit 0 fi - + for cluster_prefix in ${ci_clusters} do echo "Processing cluster: $cluster_prefix" - - # Retrieve all servers matching the cluster prefix - SERVERS=$(openstack server list --name "${cluster_prefix}-.*" -f value -c ID -c Name) - - if [[ -z "$SERVERS" ]]; then - echo "No servers found for cluster ${cluster_prefix}" - continue - fi - - KEEP_FLAG=false - while IFS= read -r line; do - SERVER_ID=$(echo "$line" | awk '{print $1}') - SERVER_NAME=$(echo "$line" | awk '{print $2}') - - # Check tags only on control nodes - if [[ "$SERVER_NAME" == "${cluster_prefix}-control" ]]; then - TAGS=$(openstack server show $SERVER_ID --column tags --format value) - if [[ $TAGS =~ "keep" ]]; then - echo "Skipping cluster ${cluster_prefix} - control instance is tagged as keep" - KEEP_FLAG=true - break - fi - fi - done <<< "$SERVERS" - - # Delete all servers if control node is not tagged with keep - if [[ "$KEEP_FLAG" == false ]]; then - echo "Deleting all servers in cluster ${cluster_prefix}" - while IFS= read -r line; do - SERVER_ID=$(echo "$line" | awk '{print $1}') - echo "Deleting server $SERVER_ID" - openstack server delete $SERVER_ID || true - done <<< "$SERVERS" + TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping ${cluster_prefix} - control instance is tagged as keep" + else + ./dev/delete-cluster.py ${cluster_prefix} --force fi done shell: bash From f1fd75e772d4c9122bb7fcb79279c7a26b2f5f5b Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:21:42 +0000 Subject: [PATCH 172/182] Update nightly-cleanup.yml --- .github/workflows/nightly-cleanup.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 8ea3ca74d..e15049f08 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -63,11 +63,20 @@ jobs: echo "No clusters to delete." exit 0 fi - + for cluster_prefix in ${ci_clusters} do echo "Processing cluster: $cluster_prefix" - TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value) + # Get all servers with the matching name for control node + CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json) + SERVER_COUNT=$(echo "$CONTROL_SERVERS" | jq length) + + if [[ $SERVER_COUNT -gt 1 ]]; then + echo "Warning: More than one server found for control node '${cluster_prefix}-control'." 
+ continue + fi + TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags' ) + if [[ $TAGS =~ "keep" ]]; then echo "Skipping ${cluster_prefix} - control instance is tagged as keep" else From edbcebc09b1321c86bcea1a7f3f181ae70b7ac14 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Mon, 13 Jan 2025 13:39:06 +0000 Subject: [PATCH 173/182] Fix tag determination --- .github/workflows/nightly-cleanup.yml | 29 ++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index e15049f08..0f7156fad 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -70,17 +70,28 @@ jobs: # Get all servers with the matching name for control node CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json) SERVER_COUNT=$(echo "$CONTROL_SERVERS" | jq length) - + if [[ $SERVER_COUNT -gt 1 ]]; then - echo "Warning: More than one server found for control node '${cluster_prefix}-control'." - continue - fi - TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags' ) - - if [[ $TAGS =~ "keep" ]]; then - echo "Skipping ${cluster_prefix} - control instance is tagged as keep" + echo "Multiple servers found for control node '${cluster_prefix}-control'. Checking tags for each..." + + for server in $(echo "$CONTROL_SERVERS" | jq -r '.[].ID'); do + # Get tags for each control node + TAGS=$(openstack server show "$server" --column tags --format value) + + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping ${cluster_prefix} (server ${server}) - control instance is tagged as keep" + else + ./dev/delete-cluster.py ${cluster_prefix} --force + fi + done else - ./dev/delete-cluster.py ${cluster_prefix} --force + # If only one server, extract its tags and proceed + TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags') + if [[ $TAGS =~ "keep" ]]; then + echo "Skipping ${cluster_prefix} - control instance is tagged as keep" + else + ./dev/delete-cluster.py ${cluster_prefix} --force + fi fi done shell: bash From fd5cbf992bfa9aca2e018e2051172a8348e2ec70 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 09:25:07 +0000 Subject: [PATCH 174/182] pause in workflow to debug slurm state --- .github/workflows/stackhpc.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index d5bd313ca..35630d4dc 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -185,6 +185,9 @@ jobs: ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml ansible-playbook -v ansible/ci/check_slurm.yml + - name: Pause for debugging + run: sleep 1800 + - name: Check sacct state survived reimage run: | . 
venv/bin/activate From f661c7fef6a741fe715d24815f7350b66d2e64ea Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 10:49:01 +0000 Subject: [PATCH 175/182] debug wait on failure --- .github/workflows/stackhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 35630d4dc..f8b0167ae 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -186,7 +186,8 @@ jobs: ansible-playbook -v ansible/ci/check_slurm.yml - name: Pause for debugging - run: sleep 1800 + if: failure() + run: sleep 3600 - name: Check sacct state survived reimage run: | From 329e054742e1d8ddbf7670dfbfddbdf735fa0470 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Tue, 14 Jan 2025 12:45:29 +0100 Subject: [PATCH 176/182] Fix environment creation steps We need to be at the root of the repository to run the next commands. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dd6451011..1a0acd630 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,9 @@ and follow the prompts to complete the environment name and description. **NB:** In subsequent sections this new environment is referred to as `$ENV`. -Activate the new environment: +Go back to the root folder and activate the new environment: + cd .. . environments/$ENV/activate And generate secrets for it: From 81c316a594aa3bc602350d80ea31e4731c11d001 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 15:40:33 +0000 Subject: [PATCH 177/182] allow empty compute_init_enable list --- .github/workflows/stackhpc.yml | 4 ---- ansible/extras.yml | 1 - .../{{cookiecutter.environment}}/terraform/compute.tf | 4 ++-- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index f8b0167ae..d5bd313ca 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -185,10 +185,6 @@ jobs: ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml ansible-playbook -v ansible/ci/check_slurm.yml - - name: Pause for debugging - if: failure() - run: sleep 3600 - - name: Check sacct state survived reimage run: | . 
venv/bin/activate diff --git a/ansible/extras.yml b/ansible/extras.yml index 6bb141109..13a887dd9 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -44,7 +44,6 @@ # NB: has to be after eeesi and os-manila-mount tags: compute_init become: yes - name: Export hostvars tasks: - include_role: name: compute_init diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index 20fcd5d89..a90108924 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -20,11 +20,11 @@ module "compute" { root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) extra_volumes = lookup(each.value, "extra_volumes", {}) + compute_init_enable = lookup(each.value, "compute_init_enable", []) + key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] - - compute_init_enable = each.value.compute_init_enable } From 9897f29b7220a6f7bce6b06a2da41c6b2d068158 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 14 Jan 2025 17:04:16 +0000 Subject: [PATCH 178/182] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 3c43e02eb..37bd8c3d6 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250109-2102-5193ba2f", - "RL9": "openhpc-RL9-250110-0016-5193ba2f" + "RL8": "openhpc-RL8-250114-1627-bccc88b5", + "RL9": "openhpc-RL9-250114-1626-bccc88b5" } } From 257e685aa151098c5da007a032e99424b49938ff Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Wed, 15 Jan 2025 11:00:37 +0100 Subject: [PATCH 179/182] Document required security groups (#534) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1a0acd630..54b74d799 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ Before starting ensure that: - You have an SSH keypair defined in OpenStack, with the private key available on the deploy host. - Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). +- Three security groups are present: ``default`` allowing intra-cluster communication, ``SSH`` allowing external access via SSH and ``HTTPS`` allowing access for Open OnDemand. 
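Where these security groups do not already exist, they can be created with something like the following sketch (the rules are illustrative assumptions; restrict the remote IP ranges to suit the site):

```bash
openstack security group create SSH
openstack security group rule create SSH --protocol tcp --dst-port 22 --remote-ip 0.0.0.0/0
openstack security group create HTTPS
openstack security group rule create HTTPS --protocol tcp --dst-port 443 --remote-ip 0.0.0.0/0
```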
### Setup deploy host From e8f1cbe6237cb8a692ef426e8abcc97e2bcc4393 Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Wed, 15 Jan 2025 10:01:53 +0000 Subject: [PATCH 180/182] Bump Zenith client to latest from azimuth-cloud namespace (#437) --- ansible/roles/zenith_proxy/defaults/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml index dbb920c58..6b1a43aaa 100644 --- a/ansible/roles/zenith_proxy/defaults/main.yml +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -15,12 +15,12 @@ zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" -zenith_proxy_image_tag: '0.1.0' +zenith_proxy_image_tag: '0.12.0' -zenith_proxy_client_image_repository: ghcr.io/stackhpc/zenith-client +zenith_proxy_client_image_repository: ghcr.io/azimuth-cloud/zenith-client zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" -zenith_proxy_mitm_image_repository: ghcr.io/stackhpc/zenith-proxy +zenith_proxy_mitm_image_repository: ghcr.io/azimuth-cloud/zenith-proxy zenith_proxy_mitm_image: "{{ zenith_proxy_mitm_image_repository }}:{{ zenith_proxy_image_tag }}" zenith_proxy_upstream_scheme: http From 1e5e105da8b35ef74150c87ee118063750cf69bb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 15 Jan 2025 10:34:27 +0000 Subject: [PATCH 181/182] fix yaml formatting in operations docs --- docs/operations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/operations.md b/docs/operations.md index 4bebe1b3f..595ddcbf5 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -82,7 +82,7 @@ Additional packages from any DNF repositories which are enabled during build (wh appliances_extra_packages_other: - somepackage - anotherpackage - +``` The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules. 
From 5f7e48fbdb5f0c8843cefb7ef35cc4c23baf9f9e Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 15 Jan 2025 12:56:38 +0000 Subject: [PATCH 182/182] Enable image builds to install extra packages by default (#536) * Enable image builds to install extra packages by default * simplify adding additional packages * Fix docs typo Co-authored-by: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> --------- Co-authored-by: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> --- ansible/extras.yml | 1 - docs/operations.md | 42 +++++++++++++------ .../inventory/group_vars/all/defaults.yml | 3 -- environments/common/inventory/groups | 1 + environments/common/layouts/everything | 2 +- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index 13a887dd9..72c76b3b1 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -65,4 +65,3 @@ - name: Install additional packages dnf: name: "{{ appliances_extra_packages }}" - when: appliances_mode != 'configure' or appliances_extra_packages_during_configure diff --git a/docs/operations.md b/docs/operations.md index 595ddcbf5..7a0a5b919 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -63,7 +63,7 @@ This is a usually a two-step process: Deploying the additional nodes and applying these changes requires rerunning both Terraform and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster). # Adding Additional Packages -By default, the following utility packages are installed during build: +By default, the following utility packages are installed during the StackHPC image build: - htop - nano - screen @@ -75,18 +75,34 @@ By default, the following utility packages are installed during build: - git - latest python version for system (3.6 for for Rocky 8.9 and 3.12 for Rocky 9.4) -Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_extra_packages_other` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example: - -```yaml - # environments/foo-base/inventory/group_vars/all/defaults.yml: - appliances_extra_packages_other: - - somepackage - - anotherpackage -``` - -The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules. - -If you wish to install packages during runtime, the `site.yml` playbook should be run with `appliances_packages_during_configure` overriden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enable DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users). +Additional packages can be added during image builds by: +- adding the `extra_packages` group to the build `inventory_groups` (see +[docs/image-build.md](./image-build.md)) +- defining a list of packages in `appliances_extra_packages_other` in e.g. +`environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. 
For example: + + ```yaml + # environments/foo-base/inventory/group_vars/all/defaults.yml: + appliances_extra_packages_other: + - somepackage + - anotherpackage + ``` + +For packages which come from repositories mirrored by StackHPC's "Ark" Pulp server +(including rocky, EPEL and OpenHPC repositories), this will require either [Ark +credentials](./image-build.md) or a [local Pulp mirror](./experimental/pulp.md) +to be configured. + +The packages available from the OpenHPC repos are described in Appendix E of +the OpenHPC installation guide (linked from the +[OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note +"user-facing" OpenHPC packages such as compilers, mpi libraries etc. include +corresponding `lmod` modules. + +Packages *may* also be installed during `site.yml`, by adding the `cluster` +group into the `extra_packages` group. An error will occur if Ark credentials +are defined in this case, as they are readable by unprivileged users in the +`.repo` files and a local Pulp mirror must be used instead. If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this: - `ansible.builtin.yum_repository`: Add a repo from a URL providing a 'repodata' directory. diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e26bc3018..23aafd73e 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -94,9 +94,6 @@ appliances_extra_packages_default: - git - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" - -appliances_extra_packages_during_configure: false - appliances_extra_packages_other: [] appliances_extra_packages: "{{ appliances_extra_packages_default + appliances_extra_packages_other }}" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 1d756ed66..cb49b92e2 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -161,6 +161,7 @@ freeipa_client # Hosts to replace system repos with Pulp repos # Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users builder +extra_packages [pulp] # Add builder to this group to enable automatically syncing of pulp during image build diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 4293cbca0..8b5046bfc 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -110,4 +110,4 @@ control [extra_packages:children] # Hosts to install specified additional packages on -cluster +builder
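With this change run-time package installs become opt-in; a site with a local Pulp mirror could opt back in with a sketch like:

```ini
# environments/$SITE_ENV/inventory/extra_groups (sketch)
[extra_packages:children]
cluster
```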