diff --git a/input/config/rhel/9.4/openmpi.json b/input/config/rhel/9.4/openmpi.json new file mode 100644 index 000000000..231e33aef --- /dev/null +++ b/input/config/rhel/9.4/openmpi.json @@ -0,0 +1,12 @@ +{ + "openmpi": { + "cluster": [ + { "package": "openmpi", + "type": "tarball", + "url": "https://download.open-mpi.org/release/open-mpi/v{{ openmpi_version.split('.')[:2] | join('.') }}/openmpi-{{ openmpi_version }}.tar.gz" + }, + {"package": "gcc-c++", "type": "rpm", "repo_name": "appstream"}, + {"package": "clang", "type": "rpm", "repo_name": "appstream"} + ] + } +} diff --git a/input/config/rhel/9.4/slurm.json b/input/config/rhel/9.4/slurm.json new file mode 100644 index 000000000..165ac4357 --- /dev/null +++ b/input/config/rhel/9.4/slurm.json @@ -0,0 +1,32 @@ +{ + "slurm": { + "cluster": [ + {"package": "munge", "type": "rpm", "repo_name": "appstream"}, + {"package": "firewalld", "type": "rpm", "repo_name": "baseos"}, + {"package": "python3-firewall", "type": "rpm", "repo_name": "baseos"} + ] + }, + "slurm_control_node": { + "cluster": [ + {"package": "slurm-slurmctld", "type": "rpm", "repo_name": "epel"} + ] + }, + "slurm_node": { + "cluster": [ + {"package": "slurm-slurmd", "type": "rpm", "repo_name": "epel"} + ] + }, + "slurmdbd":{ + "cluster": [ + {"package": "slurm-slurmdbd", "type": "rpm", "repo_name": "epel"}, + {"package": "python3-PyMySQL", "type": "rpm", "repo_name": "appstream"}, + {"package": "mysql-server", "type": "rpm", "repo_name": "appstream"}, + {"package": "mariadb-server", "type": "rpm", "repo_name": "appstream"} + ] + }, + "login":{ + "cluster": [ + {"package": "slurm", "type": "rpm", "repo_name": "epel"} + ] + } +} \ No newline at end of file diff --git a/input/config/rhel/9.4/ucx.json b/input/config/rhel/9.4/ucx.json new file mode 100644 index 000000000..2f91a5ab5 --- /dev/null +++ b/input/config/rhel/9.4/ucx.json @@ -0,0 +1,11 @@ +{ + "ucx": { + "cluster": [ + { "package": "ucx", + "type": "tarball", + "url": 
"https://github.com/openucx/ucx/releases/download/v{{ ucx_version }}/ucx-{{ ucx_version }}.tar.gz" + }, + {"package": "gcc-c++", "type": "rpm", "repo_name": "appstream"} + ] + } +} diff --git a/input/omnia_config.yml b/input/omnia_config.yml index cf5add78c..eea416651 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -26,10 +26,22 @@ ansible_config_file_path: "/etc/ansible" # -----------------------------SLURM------------------------------------------------ -# Password used for Slurm database. +# Username and password used for Slurm database. # The Length of the password should be at least 8. # The password must not contain -,\, '," -mariadb_password: "password" +slurm_db_username: root +slurm_db_password: "" + +# Host and port of the Slurm database. +# If no database is installed on the given node, one will be created. +# Defaults to the slurmdbd host if no host is given. +# Defaults to 3306 if no port is given. +slurm_db_host: +slurm_db_port: 3306 + +# Type of database to be used by Slurm. +# Options are mysql or mariadb. Defaults to mariadb. 
+slurm_db_type: mariadb # This variable accepts whether slurm installation is supported in configless mode or slurm in nfs # Default value is "configless" diff --git a/scheduler/add_node.yml b/scheduler/add_node.yml new file mode 100644 index 000000000..ca0c3ef36 --- /dev/null +++ b/scheduler/add_node.yml @@ -0,0 +1,23 @@ +--- +- name: Add nodes Slurm + hosts: slurm_control_node, slurm_node, login, slurm_dbd + any_errors_fatal: true + vars: + share_mounted_path: "{{ hostvars['localhost']['share_path'] | default('/home') }}" # from storage.yml + #TODO: nfs_client role here, slurm depends on a mandatory share path + pre_tasks: + - name: Include input project directory + ansible.builtin.import_role: + name: ../utils/roles/include_input_dir + run_once: true + delegate_to: localhost + - name: Include vars omnia_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/omnia_config.yml" + run_once: true + delegate_to: localhost + tasks: + - name: Add node + ansible.builtin.import_role: + name: slurm + tasks_from: add_node diff --git a/scheduler/remove_node.yml b/scheduler/remove_node.yml new file mode 100644 index 000000000..61dffca55 --- /dev/null +++ b/scheduler/remove_node.yml @@ -0,0 +1,23 @@ +--- +- name: Remove nodes Slurm + hosts: slurm_control_node, slurm_node + any_errors_fatal: true + vars: + share_mounted_path: "{{ hostvars['localhost']['share_path'] | default('/home') }}" # from storage.yml + #TODO: nfs_client role here, slurm depends on a mandatory share path + pre_tasks: + - name: Include input project directory + ansible.builtin.import_role: + name: ../utils/roles/include_input_dir + run_once: true + delegate_to: localhost + - name: Include vars omnia_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/omnia_config.yml" + run_once: true + delegate_to: localhost + tasks: + - name: Remove node + ansible.builtin.import_role: + name: slurm + tasks_from: rm_node diff --git 
a/scheduler/roles/install_benchmarks_tools/tasks/compile_install_openmpi.yml b/scheduler/roles/install_benchmarks_tools/tasks/compile_install_openmpi.yml index 4008f2299..907c9dea5 100644 --- a/scheduler/roles/install_benchmarks_tools/tasks/compile_install_openmpi.yml +++ b/scheduler/roles/install_benchmarks_tools/tasks/compile_install_openmpi.yml @@ -85,13 +85,9 @@ - ucx_dir_data.stat.exists - ucx_cmd.rc == 0 - - name: Construct the command to compile the openmpi when slurm support is true - when: slurm_support - ansible.builtin.include_tasks: openmpi_cmd_with_slurm.yml - - - name: Construct the command to compile the openmpi when slurm support is false - when: not slurm_support - ansible.builtin.include_tasks: openmpi_cmd_without_slurm.yml + - name: Construct openmpi compile the command with ucx and slurm + ansible.builtin.set_fact: + openmpi_compile_cmd: "./configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility --enable-prte-prefix-by-default {{ '--with-slurm=yes' if slurm_support else '--with-slurm=no' }}{{ ' --with-ucx='+ucx_dir_data.stat.path+' ' if ucx_installed else ' ' }}CC=clang CXX=clang++ 2>&1 | tee config.out" - name: Create a build directory inside openmpi folder ansible.builtin.file: diff --git a/scheduler/roles/install_benchmarks_tools/tasks/openmpi_cmd_with_slurm.yml b/scheduler/roles/install_benchmarks_tools/tasks/openmpi_cmd_with_slurm.yml deleted file mode 100644 index 2df0c9152..000000000 --- a/scheduler/roles/install_benchmarks_tools/tasks/openmpi_cmd_with_slurm.yml +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - - -- name: Set the omnia share path - ansible.builtin.set_fact: - slurm_install_type: "{{ hostvars['127.0.0.1']['slurm_installation_type'] }}" - -- name: Check if nfs slurm installation type is nfs share - ansible.builtin.set_fact: - slurm_installed_path: "{{ omnia_share_path }}/{{ slurm_dir }}/{{ usr_dir }}" - when: slurm_install_type == "nfs_share" - -- name: Check if nfs slurm installation type is configless - ansible.builtin.set_fact: - slurm_installed_path: "/{{ usr_dir }}" - when: slurm_install_type == "configless" - -- name: Construct openmpi compile the command with ucx - ansible.builtin.set_fact: - openmpi_compile_cmd: "../configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility - --enable-orterun-prefix-by-default - --with-pmix - --with-slurm={{ slurm_installed_path }} - --with-ucx={{ omnia_share_path }}/{{ benchmarks_dir_ucx }} - CC=clang CXX=clang++ 2>&1 | tee config.out" - when: - - ucx_installed - - ansible_distribution | lower is in supported_os_type - - ansible_distribution_version in supported_os_version - -- name: Construct openmpi compile the command with ucx - ansible.builtin.set_fact: - openmpi_compile_cmd: "../configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility - --enable-orterun-prefix-by-default - --with-pmi - --with-slurm={{ slurm_installed_path }} - --with-ucx={{ omnia_share_path }}/{{ benchmarks_dir_ucx }} - CC=clang CXX=clang++ 2>&1 | tee config.out" - when: - - ucx_installed - - ansible_distribution | lower == "ubuntu" - - 
ansible_distribution_version == "22.04" - - -- name: Construct the command to compile the openmpi without ucx - ansible.builtin.set_fact: - openmpi_compile_cmd: "../configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility - --enable-orterun-prefix-by-default --with-pmix --with-slurm={{ slurm_installed_path }} CC=clang CXX=clang++ 2>&1 | tee config.out " - when: - - not ucx_installed - - ansible_distribution | lower is in supported_os_type - - ansible_distribution_version in supported_os_version - -- name: Construct the command to compile the openmpi without ucx - ansible.builtin.set_fact: - openmpi_compile_cmd: "../configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility - --enable-orterun-prefix-by-default --with-pmi --with-slurm={{ slurm_installed_path }} CC=clang CXX=clang++ 2>&1 | tee config.out " - when: - - not ucx_installed - - ansible_distribution | lower == "ubuntu" - - ansible_distribution_version == "22.04" diff --git a/scheduler/roles/install_benchmarks_tools/tasks/openmpi_cmd_without_slurm.yml b/scheduler/roles/install_benchmarks_tools/tasks/openmpi_cmd_without_slurm.yml deleted file mode 100644 index f2adfc4fe..000000000 --- a/scheduler/roles/install_benchmarks_tools/tasks/openmpi_cmd_without_slurm.yml +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -- name: Prepare for Openmpi compilation on RHEL/Rocky Linux - when: ansible_distribution | lower in [redhat_os, rocky_os] - block: - - name: Prepare for Openmpi compilation with ucx and without slurm - ansible.builtin.set_fact: - openmpi_compile_cmd: "../configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility - --enable-orterun-prefix-by-default --with-ucx={{ omnia_share_path }}/{{ benchmarks_dir_ucx }} - CC=clang CXX=clang++ 2>&1 | tee config.out " - when: - - ucx_installed - - - name: Prepare for Openmpi compilation without slurm and ucx - ansible.builtin.set_fact: - openmpi_compile_cmd: "../configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility - --enable-orterun-prefix-by-default CC=clang CXX=clang++ 2>&1 | tee config.out " - when: - - not ucx_installed - -- name: Prepare for Openmpi compilation on 'Ubuntu' Linux - when: ansible_distribution | lower == ubuntu_os - block: - - name: Prepare for compilation of openmpi with ucx and without slurm - ansible.builtin.set_fact: - openmpi_compile_cmd: "../configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility - --enable-orterun-prefix-by-default --with-ucx={{ omnia_share_path }}/{{ benchmarks_dir_ucx }} - CC=gcc CXX=g++ 2>&1 | tee config.out " - when: - - ucx_installed - - - name: Prepare for Openmpi compilation without slurm and ucx - ansible.builtin.set_fact: - openmpi_compile_cmd: "../configure --prefix={{ omnia_share_path }}/{{ benchmarks_dir_openmpi }} --enable-mpi1-compatibility - --enable-orterun-prefix-by-default CC=gcc CXX=gcc++ 2>&1 | tee config.out " - when: - - not ucx_installed diff --git a/scheduler/roles/slurm/defaults/main.yml b/scheduler/roles/slurm/defaults/main.yml new file mode 100644 index 000000000..aa8315c87 --- /dev/null +++ b/scheduler/roles/slurm/defaults/main.yml @@ -0,0 +1,49 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +mpi: {} +cgroup: {} +__cgroup_default_config: + CgroupPlugin: autodetect + ConstrainCores: true + ConstrainDevices: true + ConstrainRAMSpace: true + ConstrainSwapSpace: true +__slurm_default_config: + SlurmUser: "{{ slurm_user }}" + SlurmctldPort: 6817 + SlurmdPort: 6818 + SrunPortRange: "60001-63000" + StateSaveLocation: "/var/spool/state" + SlurmdSpoolDir: "/var/spool/slurmd" + ReturnToService: 2 + SchedulerType: sched/backfill + MpiDefault: None + ProctrackType: proctrack/cgroup + SelectType: select/linear + SlurmctldLogFile: /var/log/slurmctld.log + SlurmdLogFile: /var/log/slurmd.log + SlurmctldPidFile: /var/run/slurmctld.pid + SlurmdPidFile: /var/run/slurmd.pid + AuthType: auth/munge + CryptoType: crypto/munge + SlurmctldTimeout: 120 + SlurmdTimeout: 300 +__slurm_dbd_default_config: + AuthType: auth/munge + LogFile: /var/log/slurmdbd.log + PidFile: /var/run/slurmdbd.pid + SlurmUser: "{{ slurm_user }}" + StorageType: accounting_storage/mysql + StorageLoc: slurm_acct_db diff --git a/scheduler/roles/slurm/handlers/main.yml b/scheduler/roles/slurm/handlers/main.yml new file mode 100644 index 000000000..b4175ef85 --- /dev/null +++ b/scheduler/roles/slurm/handlers/main.yml @@ -0,0 +1,82 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Restart chrony + ansible.builtin.systemd_service: + name: "{{ 'chronyd' if ansible_os_family == 'RedHat' else 'chrony' }}" + state: restarted + enabled: true + +- name: Restart munge + ansible.builtin.systemd_service: + name: munge + state: restarted + +- name: Restart mysqld + ansible.builtin.systemd_service: + name: mysqld + state: restarted + run_once: true + when: restart_slurm_services + delegate_to: "{{ slurm_db_host }}" + +- name: Restart mariadb + ansible.builtin.systemd_service: + name: mariadb + state: restarted + run_once: true + when: restart_slurm_services + delegate_to: "{{ slurm_db_host }}" + +- name: Reload slurmdbd + ansible.builtin.systemd_service: + name: slurmdbd + state: reloaded + daemon_reload: true + enabled: true + when: restart_slurm_services and ('slurm_dbd' in group_names) + +- name: Restart slurmdbd + ansible.builtin.systemd_service: + name: slurmdbd + state: restarted + when: restart_slurm_services and ('slurm_dbd' in group_names) + +- name: Reload slurmctld + ansible.builtin.systemd_service: + name: slurmctld + state: reloaded + daemon_reload: true + enabled: true + when: restart_slurm_services and ('slurm_control_node' in group_names) + +- name: Restart slurmctld + ansible.builtin.systemd_service: + name: slurmctld + state: restarted + when: restart_slurm_services and ('slurm_control_node' in group_names) + +- name: Reload slurmd + ansible.builtin.systemd_service: + name: slurmd + state: reloaded + daemon_reload: true + enabled: true + when: restart_slurm_services and ('slurm_node' in group_names) + +- name: 
Restart slurmd + ansible.builtin.systemd_service: + name: slurmd + state: restarted + when: restart_slurm_services and ('slurm_node' in group_names) diff --git a/scheduler/roles/slurm/meta/main.yml b/scheduler/roles/slurm/meta/main.yml new file mode 100644 index 000000000..9efbcbf01 --- /dev/null +++ b/scheduler/roles/slurm/meta/main.yml @@ -0,0 +1,9 @@ +--- +galaxy_info: + author: Jagadeesh N V + description: Omnia + company: Dell + license: Apache-2.0 + min_ansible_version: "2.1"  # quoted so YAML keeps a version string instead of the float 2.1 + galaxy_tags: [] +dependencies: [] diff --git a/scheduler/roles/slurm/tasks/_config_files.yml b/scheduler/roles/slurm/tasks/_config_files.yml new file mode 100644 index 000000000..af25438a8 --- /dev/null +++ b/scheduler/roles/slurm/tasks/_config_files.yml @@ -0,0 +1,117 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Append share path if NFS + ansible.builtin.set_fact: + slurm_share_prefix: "{{ share_path }}/{{ slurm_dir_name }}" + +- name: Ensure Slurm conf share dir + ansible.builtin.file: + path: "{{ slurm_share_prefix }}{{ slurm_config_dir }}" + state: directory + owner: root + group: root + mode: "{{ common_mode }}" + +- name: Ensure directories for slurm ops + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + mode: "{{ common_mode }}" + loop: + - "{{ slurm_conf_dict['StateSaveLocation'] }}" + - "{{ slurm_config_dir }}" + +- name: Create slurm.conf + ansible.builtin.template: # Create dest directory before + src: "slurm.conf.j2" + dest: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + # owner: root + # group: root + mode: "{{ conf_file_mode }}" + run_once: true + notify: + - Restart slurmctld + +- name: Create cgroup.conf + ansible.builtin.template: # Create dest directory before + src: "cgroup.conf.j2" + dest: "{{ item }}" + owner: root + group: root + mode: "{{ conf_file_mode }}" + loop: + - "{{ slurm_share_prefix }}{{ slurm_config_dir }}/cgroup.conf" + - "{{ slurm_config_dir }}/cgroup.conf" + notify: + - Restart slurmctld + - Restart slurmdbd + +# Environment Var SLURM_CONF +- name: Set env variable + community.general.ini_file: + path: "/etc/environment" + option: SLURM_CONF + value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + no_extra_spaces: true + mode: u=rw,g=r,o=r + +- name: All other host configs # TODO: failsafe + ansible.builtin.template: + src: "all_other.conf.j2" + dest: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/{{ item }}.conf" + mode: "{{ conf_file_mode }}" + backup: true + loop: + - mpi + - acct_gather + - gres + when: item in vars + notify: + - Reload slurmctld + - Reload slurmd + +- name: Local copy slurm.conf /etc/slurm/ + ansible.builtin.template: # Local copy # FAILSAFE + src: "{{ item }}.conf.j2" + dest: "{{ slurm_config_dir 
}}/{{ item }}.conf" + owner: root + group: root + mode: "{{ conf_file_mode }}" + ignore_errors: true + register: ignore_errors_register + loop: + - slurm + - cgroup + notify: + - Restart slurmctld + +- name: All other host local configs # FAILSAFE + ansible.builtin.template: + src: "all_other.conf.j2" + dest: "{{ slurm_config_dir }}/{{ item }}.conf" + mode: "{{ conf_file_mode }}" + backup: true + ignore_errors: true + register: ignore_errors_register + loop: + - mpi + - acct_gather + - gres + when: item in vars + notify: + - Reload slurmctld + - Reload slurmd diff --git a/scheduler/roles/slurm/tasks/_configless.yml b/scheduler/roles/slurm/tasks/_configless.yml new file mode 100644 index 000000000..b45831fed --- /dev/null +++ b/scheduler/roles/slurm/tasks/_configless.yml @@ -0,0 +1,94 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Enable the service + ansible.builtin.service: + name: slurmd + enabled: true + register: d_service + +- name: Get service path + ansible.builtin.set_fact: + slurmd_service_path: "{{ d_service['status']['FragmentPath'] | default('/usr/lib/systemd/system/slurmd.service') }}" + +- name: Edit the service !ConditionPathExists + community.general.ini_file: + path: "{{ slurmd_service_path }}" + section: Unit + option: ConditionPathExists + state: absent + mode: u=rw,g=r,o=r + when: slurm_installation_type == 'configless' + notify: + - Reload slurmd + - Restart slurmd + +- name: Edit the service ExecStart + community.general.ini_file: + path: "{{ slurmd_service_path }}" + section: Service + option: ExecStart + value: "/usr/sbin/slurmd -D -s $SLURMD_OPTIONS + --conf-server {{ (hostvars[groups['slurm_control_node'] | first])['inventory_hostname_short'] + + ':' + (slurm_conf_dict['SlurmctldPort'] + | string) }}" # HA + no_extra_spaces: true + mode: u=rw,g=r,o=r + when: slurm_installation_type == 'configless' + notify: + - Reload slurmd + - Restart slurmd + +- name: Set env variable + community.general.ini_file: + path: "/etc/environment" + option: SLURM_CONF + state: absent + mode: u=rw,g=r,o=r + when: slurm_installation_type == 'configless' + +- name: Edit the service ConditionPathExists + community.general.ini_file: + path: "{{ slurmd_service_path }}" + section: Unit + option: ConditionPathExists + value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + no_extra_spaces: true + mode: u=rw,g=r,o=r + when: slurm_installation_type != 'configless' + notify: + - Reload slurmd + +- name: Edit the service to fetch specific conf + community.general.ini_file: + path: "{{ slurmd_service_path }}" + section: Service + option: ExecStart + value: "/usr/sbin/slurmd -D -s $SLURMD_OPTIONS -f {{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + no_extra_spaces: true + mode: u=rw,g=r,o=r + when: slurm_installation_type != 'configless' + notify: + - Reload 
slurmd + +- name: Set env variable + community.general.ini_file: + path: "/etc/environment" + option: SLURM_CONF + value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + no_extra_spaces: true + mode: u=rw,g=r,o=r + when: slurm_installation_type != 'configless' + notify: + - Restart slurmd \ No newline at end of file diff --git a/scheduler/roles/slurm/tasks/_edit_conf.yml b/scheduler/roles/slurm/tasks/_edit_conf.yml new file mode 100644 index 000000000..fc3550c47 --- /dev/null +++ b/scheduler/roles/slurm/tasks/_edit_conf.yml @@ -0,0 +1,54 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Append share path if NFS + ansible.builtin.set_fact: + slurm_share_prefix: "{{ share_path }}/{{ slurm_dir_name }}" + +- name: Check if line exists + ansible.builtin.lineinfile: + path: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + state: absent + line: "NodeName={{ item }}" + regexp: "^NodeName={{ item }} " + register: node_check + check_mode: true + loop: "{{ groups['slurm_node'] }}" + +- name: Insert Node + ansible.builtin.lineinfile: + path: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + state: present + line: "NodeName={{ hostvars[item.item]['inventory_hostname_short'] }} CPUs={{ hostvars[item.item]['ansible_facts']['processor_nproc'] }} Sockets={{ hostvars[item.item]['ansible_facts']['processor_count'] }} CoresPerSocket={{ hostvars[item.item]['ansible_facts']['processor_cores'] }} ThreadsPerCore={{ hostvars[item.item]['ansible_facts']['processor_threads_per_core'] }}" + insertafter: "^NodeName=" + register: node_check + when: not item.changed + loop: "{{ node_check.results }}" + +- name: Get slurm info output if slurm is installed + ansible.builtin.shell: scontrol show node | awk '/NodeName/ {print $1}' | cut -d= -f2 + register: slurm_nodes + +- name: Assemble all slurm_nodes + ansible.builtin.set_fact: + all_nodes: "{{ (groups['slurm_node'] + slurm_nodes['stdout_lines']) | unique | sort }}" + +- name: Add the partition + ansible.builtin.lineinfile: + path: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + state: present + line: "PartitionName={{ slurm_partition_name }} Nodes={{ all_nodes | join(',') }} MaxTime=INFINITE State=UP" + regexp: "^PartitionName={{ slurm_partition_name }} " + register: partition_check + diff --git a/scheduler/roles/slurm/tasks/_munge.yml b/scheduler/roles/slurm/tasks/_munge.yml new file mode 100644 index 000000000..d6c648220 --- /dev/null +++ b/scheduler/roles/slurm/tasks/_munge.yml @@ -0,0 +1,62 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Install munge packages + ansible.builtin.package: + name: "{{ munge_packages[ansible_os_family] }}" + state: present + +- name: Check if slurm munge key exists on share + ansible.builtin.stat: + path: "{{ slurm_munge_key_path | default('') }}" + register: slurm_key_path + +- name: Set the munge key path + ansible.builtin.set_fact: + slurm_munge_key_path: "{{ share_path }}/{{ slurm_dir_name }}/munge.key" + when: not slurm_key_path.stat.exists + +- name: Generate munge key + ansible.builtin.command: "dd if=/dev/random of={{ slurm_munge_key_path }} bs=1024 count=1" + when: not slurm_key_path.stat.exists + run_once: true + register: munge_output + changed_when: munge_output.rc == 0 + failed_when: munge_output.rc != 0 + +- name: Wait until the shared munge key file exists + ansible.builtin.wait_for: + path: "{{ slurm_munge_key_path }}" + state: present + timeout: 300 + +- name: Check munge dir + ansible.builtin.file: + path: /etc/munge + state: directory + +- name: Copy munge key + ansible.builtin.copy: + src: "{{ slurm_munge_key_path }}" + dest: /etc/munge/munge.key + owner: munge + group: munge + remote_src: true + mode: "{{ munge_mode }}" + +- name: Ensure Munge is enabled and running + ansible.builtin.service: + name: munge + enabled: true + state: started diff --git a/scheduler/roles/slurm/tasks/_rm_conf.yml b/scheduler/roles/slurm/tasks/_rm_conf.yml new file mode 100644 index 000000000..67dd14240 --- 
/dev/null +++ b/scheduler/roles/slurm/tasks/_rm_conf.yml @@ -0,0 +1,41 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Append share path if NFS + ansible.builtin.set_fact: + slurm_share_prefix: "{{ share_path }}/{{ slurm_dir_name }}" + +- name: Remove the slurm nodes + ansible.builtin.lineinfile: + path: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + state: absent + regexp: "^NodeName={{ item }} " + register: node_check + loop: "{{ groups['slurm_node'] }}" + +- name: Get slurm info output if slurm is installed + ansible.builtin.shell: scontrol show node | awk '/NodeName/ {print $1}' | cut -d= -f2 + register: slurm_nodes + +- name: Assemble all slurm_nodes + ansible.builtin.set_fact: + rem_nodes: "{{ (slurm_nodes['stdout_lines'] | difference(groups['slurm_node'])) | unique | sort }}" + +- name: Edit the partition + ansible.builtin.lineinfile: + path: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + state: present + line: "PartitionName={{ slurm_partition_name }} Nodes={{ (rem_nodes | join(',')) if rem_nodes else '\"\"' }} MaxTime=INFINITE State=UP"  # an empty rem_nodes list renders Nodes="" + regexp: "^PartitionName={{ slurm_partition_name }} " + register: partition_check  # one task only: a skipped twin task would overwrite this register with changed=false
diff --git a/scheduler/roles/slurm/tasks/_user.yml b/scheduler/roles/slurm/tasks/_user.yml new file mode 100644 index 000000000..1152960ce --- /dev/null +++ b/scheduler/roles/slurm/tasks/_user.yml @@ -0,0 +1,26 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Create slurm group + ansible.builtin.group: + name: slurm + state: present + +- name: Add the user 'slurm' with uid 6001 and a primary group of 'slurm' + ansible.builtin.user: + name: slurm + comment: Slurm User Account + uid: "{{ slurm_uid }}" + group: slurm + create_home: false # TODO: if share path is /home this is needed diff --git a/scheduler/roles/slurm/tasks/add_node.yml b/scheduler/roles/slurm/tasks/add_node.yml new file mode 100644 index 000000000..5c31896fb --- /dev/null +++ b/scheduler/roles/slurm/tasks/add_node.yml @@ -0,0 +1,43 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Include slurm_conf tasks + ansible.builtin.include_tasks: _edit_conf.yml + when: '"slurm_control_node" in group_names' + +- name: Include common tasks + ansible.builtin.include_tasks: common.yml + when: '"slurm_node" in group_names' + +- name: Include compute installation tasks + ansible.builtin.include_tasks: compute.yml + when: '"slurm_node" in group_names' + +- name: Restart slurmctld + ansible.builtin.systemd_service: + name: slurmctld + state: restarted + when: restart_slurm_services and ('slurm_control_node' in group_names) + +- name: Assemble all slurm_nodes + ansible.builtin.debug: + var: all_nodes + +- name: Restart slurmd on all nodes + ansible.builtin.systemd_service: + name: slurmd + state: restarted + when: restart_slurm_services and ('slurm_control_node' in group_names) and (node_check.changed or partition_check.changed) and all_nodes is defined + delegate_to: "{{ item }}" + loop: "{{ all_nodes | default([]) }}" \ No newline at end of file diff --git a/scheduler/roles/slurm/tasks/cleanall.yml b/scheduler/roles/slurm/tasks/cleanall.yml new file mode 100644 index 000000000..e46134549 --- /dev/null +++ b/scheduler/roles/slurm/tasks/cleanall.yml @@ -0,0 +1,64 @@ +--- +# tasks file for slurm +- name: Populate service facts + ansible.builtin.service_facts: + +- name: Stop Services + ansible.builtin.systemd_service: + name: "{{ item }}" + state: stopped + when: item in ansible_facts['services'].keys() + ignore_errors: true + loop: + - slurmctld.service + - slurmdbd.service + - slurmd.service + - munge.service + +- name: Remove share slurm path +
ansible.builtin.file: + path: "{{ share_path }}/{{ slurm_dir_name }}" + state: absent + run_once: true + +- name: Remove files and folders + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - "/var/spool/state" + - "{{ slurm_config_dir }}" + - "/etc/munge/munge.key" + +- name: Remove packages + ansible.builtin.package: + name: "{{ item }}" + state: absent + loop: + - slurm-slurmctld + - slurm-slurmdbd + - slurm-slurmd + - munge + +- name: Remove slurm user + ansible.builtin.user: + name: slurm + state: absent + +- name: Remove slurm group + ansible.builtin.group: + name: slurm + state: absent + +- name: UnSet env variable + community.general.ini_file: + path: "/etc/environment" + option: SLURM_CONF + state: absent + mode: u=rw,g=r,o=r + +- name: Umounts + ansible.builtin.command: "umount /home" + +- name: Refresh mounts + ansible.builtin.command: "mount -a" \ No newline at end of file diff --git a/scheduler/roles/slurm/tasks/common.yml b/scheduler/roles/slurm/tasks/common.yml new file mode 100644 index 000000000..c8bd537d6 --- /dev/null +++ b/scheduler/roles/slurm/tasks/common.yml @@ -0,0 +1,48 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Sync time + ansible.builtin.include_tasks: time_sync.yml + +- name: Install common packages + ansible.builtin.package: + name: "{{ common_packages[ansible_os_family] }}" + state: present + +- name: Firewall service running + ansible.builtin.systemd_service: + name: firewalld + state: started + enabled: true + +- name: Create user and group + ansible.builtin.include_tasks: _user.yml + +- name: Create the slurm directory on share + ansible.builtin.file: + path: "{{ share_path }}/{{ slurm_dir_name }}" + state: directory + owner: root + group: root + mode: "{{ common_mode }}" + run_once: true + +- name: Install and configure Munge + ansible.builtin.include_tasks: _munge.yml + +- name: Slurm dict ops + ansible.builtin.set_fact: + slurm_conf_dict: "{{ __slurm_default_config | ansible.builtin.combine(slurm_config | default({})) }}" + cgroup_conf_dict: "{{ __cgroup_default_config | ansible.builtin.combine(cgroup | default({})) }}" + slurm_dbd_conf_dict: "{{ __slurm_dbd_default_config | ansible.builtin.combine(slurm_dbd_config | default({})) }}" diff --git a/scheduler/roles/slurm/tasks/compute.yml b/scheduler/roles/slurm/tasks/compute.yml new file mode 100644 index 000000000..6e253e83a --- /dev/null +++ b/scheduler/roles/slurm/tasks/compute.yml @@ -0,0 +1,57 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Install compute packages + ansible.builtin.package: + name: "{{ slurm_compute_packages[ansible_os_family] }}" + state: present + +- name: Enable SlurmdPort + ansible.posix.firewalld: + port: "{{ slurm_conf_dict['SlurmdPort'] }}/tcp" + permanent: true + state: enabled + immediate: true + +- name: Append share path if NFS + ansible.builtin.set_fact: + slurm_share_prefix: "{{ share_path }}/{{ slurm_dir_name }}" + +# file and directory permissions +- name: Ensure directories - {{ item }} + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + mode: "0755" + loop: + - "{{ slurm_share_prefix }}{{ slurm_config_dir }}" + - "{{ slurm_conf_dict['SlurmdSpoolDir'] }}" + # - "{{ slurm_conf_dict['SlurmdLogFile'] | ansible.builtin.dirname }}" + # - "{{ slurm_conf_dict['SlurmdPidFile'] | ansible.builtin.dirname}}" + +- name: Create files - {{ item }} + ansible.builtin.file: + path: "{{ item }}" + state: touch + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + mode: "0644" + loop: + - "{{ slurm_conf_dict['SlurmdLogFile'] }}" + - "{{ slurm_conf_dict['SlurmdPidFile'] }}" + +- name: Configless settings + ansible.builtin.include_tasks: _configless.yml diff --git a/scheduler/roles/slurm/tasks/controller.yml b/scheduler/roles/slurm/tasks/controller.yml new file mode 100644 index 000000000..662ff42db --- /dev/null +++ b/scheduler/roles/slurm/tasks/controller.yml @@ -0,0 +1,66 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Install controller packages + ansible.builtin.package: + name: "{{ slurm_controller_packages[ansible_os_family] }}" + state: present + +- name: Create files - {{ item }} # TODO: file required? + ansible.builtin.file: + path: "{{ item }}" + state: touch + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + mode: "{{ slurm_mode }}" + loop: + - "{{ slurm_conf_dict['SlurmctldLogFile'] }}" + - "{{ slurm_conf_dict['SlurmctldPidFile'] }}" + +# Port Config +- name: Enable SlurmctldPort + ansible.posix.firewalld: + port: "{{ slurm_conf_dict['SlurmctldPort'] }}/tcp" + permanent: true + state: enabled + immediate: true + +# Edit service +- name: Enable the service + ansible.builtin.service: + name: slurmctld + enabled: true + register: ctld_service + +- name: Edit the service + community.general.ini_file: + path: "{{ ctld_service['status']['FragmentPath'] | default('/usr/lib/systemd/system/slurmctld.service') }}" + section: Unit + option: ConditionPathExists + value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + no_extra_spaces: true + mode: u=rw,g=r,o=r + notify: + - Reload slurmctld + +- name: Edit the service to fetch specific conf + community.general.ini_file: + path: "{{ ctld_service['status']['FragmentPath'] | default('/usr/lib/systemd/system/slurmctld.service') }}" + section: Service + option: ExecStart + value: "/usr/sbin/slurmctld -D -f {{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf -s $SLURMCTLD_OPTIONS" + no_extra_spaces: true + mode: u=rw,g=r,o=r + notify: + - Reload slurmctld diff --git a/scheduler/roles/slurm/tasks/db.yml b/scheduler/roles/slurm/tasks/db.yml new file mode 100644 index 000000000..236ae74fb --- /dev/null +++ b/scheduler/roles/slurm/tasks/db.yml @@ -0,0 +1,129 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Is slurm_db_username defined + ansible.builtin.set_fact: + slurm_db_username: "{{ slurm_db_username_default }}" + slurm_db_password: "{{ slurm_db_password_default }}" + when: (slurm_db_username is not defined) or (not slurm_db_username) + +- name: Is slurm_db_password defined + ansible.builtin.set_fact: + slurm_db_password: "{{ slurm_db_password_default }}" + when: (slurm_db_password is not defined) + +- name: Install db packages + ansible.builtin.package: + name: "{{ db_packages[ansible_os_family][slurm_db_type] }}" + state: present + register: new_install + +- name: Create my.cnf.d file + ansible.builtin.lineinfile: + path: /etc/my.cnf + line: "[client-server]" + create: true + +- name: Include my.cnf.d directory + ansible.builtin.lineinfile: + path: /etc/my.cnf + insertafter: '^\[client-server\]' + line: '!includedir /etc/my.cnf.d' + +- name: Set db port + ansible.builtin.lineinfile: + path: "{{ slurm_mysql_cnf_path }}" + regexp: '^port=.*' + insertafter: '^\[mysqld\]' + line: "port={{ slurm_db_port }}" + when: slurm_db_type == 'mysql' + notify: + - Restart mysqld + - Restart slurmctld + - Restart slurmdbd + +- name: Set db port + ansible.builtin.lineinfile: + path: "{{ slurm_mariadb_cnf_path }}" + regexp: '^port=.*' + insertafter: '^\[mysqld\]' + line: "port={{ slurm_db_port }}" + when: slurm_db_type == 'mariadb' + notify: + - Restart mariadb + - Restart slurmctld + - Restart slurmdbd + +- name: Enable db
port + ansible.posix.firewalld: + port: "{{ slurm_db_port }}/tcp" + permanent: true + state: enabled + immediate: true + +- name: Enable and start the mysql service + ansible.builtin.service: + name: mysqld + enabled: true + state: started + register: ctld_service + when: slurm_db_type == 'mysql' + +- name: Enable and start the mariadb service + ansible.builtin.service: + name: mariadb + enabled: true + state: started + register: ctld_service + when: slurm_db_type == 'mariadb' + +- name: Add ansible db user + community.mysql.mysql_user: + login_user: "{{ slurm_db_username_default }}" + login_password: "{{ slurm_db_password_default }}" + name: "{{ slurm_db_username }}" + password: "{{ slurm_db_password }}" + host: "localhost" + priv: '*.*:ALL,GRANT' + login_unix_socket: "{{ slurm_db_login_unix_socket }}" + state: present + when: new_install.changed + +- name: Add db user - shortname + community.mysql.mysql_user: + login_user: "{{ slurm_db_username }}" + login_password: "{{ slurm_db_password }}" + name: "{{ slurm_dbd_db_username }}" + password: "{{ slurm_dbd_db_password }}" + host: "{{ groups['slurm_dbd'][0] }}" + priv: "{{ slurm_dbd_conf_dict['StorageLoc'] }}.*:ALL,GRANT" + login_unix_socket: "{{ slurm_db_login_unix_socket }}" + state: present + notify: + - Restart slurmctld + - Restart slurmdbd + +- name: Add db user - FQDN + community.mysql.mysql_user: + login_user: "{{ slurm_db_username }}" + login_password: "{{ slurm_db_password }}" + name: "{{ slurm_dbd_db_username }}" + password: "{{ slurm_dbd_db_password }}" + host: "{{ groups['slurm_dbd'][0] }}.%" + priv: "{{ slurm_dbd_conf_dict['StorageLoc'] }}.*:ALL,GRANT" + login_unix_socket: "{{ slurm_db_login_unix_socket }}" + state: present + notify: + - Restart slurmctld + - Restart slurmdbd \ No newline at end of file diff --git a/scheduler/roles/slurm/tasks/dbd.yml b/scheduler/roles/slurm/tasks/dbd.yml new file mode 100644 index 000000000..e7abfc0f1 --- /dev/null +++ b/scheduler/roles/slurm/tasks/dbd.yml @@ -0,0 
+1,102 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Is slurm_db_host defined + ansible.builtin.set_fact: + slurm_db_host: "{{ groups['slurm_dbd'][0] }}" + when: (slurm_db_host is not defined) or (not slurm_db_host) + +- name: Is slurm_db_port defined + ansible.builtin.set_fact: + slurm_db_port: "{{ slurm_db_port_default }}" + when: (slurm_db_port is not defined) or (not slurm_db_port) + +- name: Is slurm_db_type defined + ansible.builtin.set_fact: + slurm_db_type: "{{ slurm_db_type_default }}" + when: (slurm_db_type is not defined) or (not slurm_db_type) + +- name: Create DB tasks + ansible.builtin.include_tasks: db.yml + args: + apply: + delegate_to: "{{ slurm_db_host }}" + run_once: true + +- name: Install DBD packages + ansible.builtin.package: + name: "{{ slurm_dbd_packages[ansible_os_family] }}" + state: present + +- name: Create files - {{ item }} # TODO: file required? 
+ ansible.builtin.file: + path: "{{ item }}" + state: touch + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + mode: "{{ slurm_mode }}" + loop: + - "{{ slurm_dbd_conf_dict['LogFile'] }}" + - "{{ slurm_dbd_conf_dict['PidFile'] }}" + +- name: Create slurmdbd.conf + ansible.builtin.template: # Create dest directory before + src: "dbd.conf.j2" + dest: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurmdbd.conf" + owner: "{{ slurm_user }}" + group: "{{ slurm_user }}" + mode: "{{ slurm_dbd_mode }}" + run_once: true + notify: + - Restart slurmctld + - Restart slurmdbd + +- name: Local copy slurmdbd.conf /etc/slurm/ + ansible.builtin.template: # Local copy #FAILSAFE + src: "dbd.conf.j2" + dest: "{{ slurm_config_dir }}/slurmdbd.conf" + owner: "{{ slurm_user }}" + group: "{{ slurm_user }}" + mode: "{{ slurm_dbd_mode }}" + when: '"slurm_dbd" in group_names' + ignore_errors: true + notify: + - Restart slurmdbd + - Restart slurmctld + +# Port Config +- name: Enable SlurmdbdPort + ansible.posix.firewalld: + port: "{{ slurm_dbd_port }}/tcp" + permanent: true + state: enabled + immediate: true + +# Edit service +- name: Enable the service + ansible.builtin.service: + name: slurmdbd + enabled: true + register: ctld_service + +- name: Edit the service + community.general.ini_file: + path: "{{ ctld_service['status']['FragmentPath'] | default('/usr/lib/systemd/system/slurmdbd.service') }}" + section: Unit + option: ConditionPathExists + value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurmdbd.conf" + no_extra_spaces: true + notify: + - Reload slurmdbd + - Restart slurmdbd \ No newline at end of file diff --git a/scheduler/roles/slurm/tasks/login.yml b/scheduler/roles/slurm/tasks/login.yml new file mode 100644 index 000000000..3efbbe710 --- /dev/null +++ b/scheduler/roles/slurm/tasks/login.yml @@ -0,0 +1,30 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Install login packages + ansible.builtin.package: + name: "{{ slurm_login_packages[ansible_os_family] }}" + state: present + +- name: Append share path + ansible.builtin.set_fact: + slurm_share_prefix: "{{ share_path }}/{{ slurm_dir_name }}" + +- name: Set env variable + community.general.ini_file: + path: "/etc/environment" + option: SLURM_CONF + value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf" + no_extra_spaces: true + mode: u=rw,g=r,o=r diff --git a/scheduler/roles/slurm/tasks/main.yml b/scheduler/roles/slurm/tasks/main.yml new file mode 100644 index 000000000..5aae2a3f9 --- /dev/null +++ b/scheduler/roles/slurm/tasks/main.yml @@ -0,0 +1,39 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Validate inventory + ansible.builtin.include_tasks: validation.yml + run_once: true + +- name: Include common tasks + ansible.builtin.include_tasks: common.yml + +- name: Install configs on all + ansible.builtin.include_tasks: _config_files.yml + +- name: Include controller installation tasks + ansible.builtin.include_tasks: controller.yml + when: '"slurm_control_node" in group_names' + +- name: Include compute installation tasks + ansible.builtin.include_tasks: compute.yml + when: '"slurm_node" in group_names' + +- name: Include slurm DBD installation tasks + ansible.builtin.include_tasks: dbd.yml + when: '"slurm_dbd" in group_names' + +- name: Include login node installation tasks + ansible.builtin.include_tasks: login.yml + when: '"login" in group_names' diff --git a/scheduler/roles/slurm/tasks/rm_node.yml b/scheduler/roles/slurm/tasks/rm_node.yml new file mode 100644 index 000000000..35cf43765 --- /dev/null +++ b/scheduler/roles/slurm/tasks/rm_node.yml @@ -0,0 +1,35 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Include slurm_conf tasks + ansible.builtin.include_tasks: _rm_conf.yml + when: '"slurm_control_node" in group_names' + +- name: Restart slurmctld + ansible.builtin.systemd_service: + name: slurmctld + state: restarted + when: restart_slurm_services and ('slurm_control_node' in group_names) + +- name: Assemble all slurm_nodes + ansible.builtin.debug: + var: rem_nodes + +- name: Restart slurmd on remaining nodes + ansible.builtin.systemd_service: + name: slurmd + state: restarted + when: restart_slurm_services and ('slurm_control_node' in group_names) and (node_check.changed or partition_check.changed) and rem_nodes is defined + delegate_to: "{{ item }}" + loop: "{{ rem_nodes | default([]) }}" diff --git a/scheduler/roles/slurm/tasks/time_sync.yml b/scheduler/roles/slurm/tasks/time_sync.yml new file mode 100644 index 000000000..c1c90c2d7 --- /dev/null +++ b/scheduler/roles/slurm/tasks/time_sync.yml @@ -0,0 +1,33 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- +- name: Install chrony + ansible.builtin.package: + name: chrony + state: present + +- name: Ensure chrony dir + ansible.builtin.file: + path: "/etc/chrony" + state: directory + owner: root + group: root + mode: "0755" + +- name: Configure chrony to sync time with NTP servers + ansible.builtin.template: + src: chrony.conf.j2 + dest: /etc/chrony/chrony.conf + mode: "0644" + notify: Restart chrony diff --git a/scheduler/roles/slurm/tasks/validation.yml b/scheduler/roles/slurm/tasks/validation.yml new file mode 100644 index 000000000..65e3a71a4 --- /dev/null +++ b/scheduler/roles/slurm/tasks/validation.yml @@ -0,0 +1,39 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- +- name: Slurm control and node group to contain at least 1 node + ansible.builtin.assert: + that: + - "'slurm_control_node' in groups" + - "'slurm_node' in groups" + - groups['slurm_control_node'] | length | int >= 1 + - groups['slurm_node'] | length | int >= 1 + fail_msg: "{{ inv_validation_fail_msg }}" + +- name: Control node not part of other groups + ansible.builtin.assert: + that: + - groups['slurm_control_node'] | intersect(groups['slurm_node']) | length | int == 0 + - groups['slurm_control_node'] | intersect(groups['login'] | default([])) | length | int == 0 + fail_msg: "{{ intersect_validation_fail_msg }}" + +- name: Check unreachable + ansible.builtin.assert: + that: + - groups['slurm_control_node'] | difference(ansible_play_hosts) | length | int == 0 + fail_msg: "All control nodes are not reachable." + +# TODO: Add validation for slurmdbd inputs + +# TODO: Add validation for db inputs diff --git a/scheduler/roles/slurm/templates/all_other.conf.j2 b/scheduler/roles/slurm/templates/all_other.conf.j2 new file mode 100644 index 000000000..a7c8036f4 --- /dev/null +++ b/scheduler/roles/slurm/templates/all_other.conf.j2 @@ -0,0 +1,7 @@ +{% set conf_dict = lookup('vars', item) %} +{% for key in conf_dict | sort %} +{% set val = conf_dict[key] %} +{% if val is not none and val != omit %} +{{ key }}={{ 'yes' if val is sameas true else ('no' if val is sameas false else val) }} +{% endif %} +{% endfor %} \ No newline at end of file diff --git a/scheduler/roles/slurm/templates/cgroup.conf.j2 b/scheduler/roles/slurm/templates/cgroup.conf.j2 new file mode 100644 index 000000000..7fcdfc403 --- /dev/null +++ b/scheduler/roles/slurm/templates/cgroup.conf.j2 @@ -0,0 +1,6 @@ +{% for key in cgroup_conf_dict | sort %} +{% set val = cgroup_conf_dict[key] %} +{% if val is not none and val != omit %} +{{ key }}={{ 'yes' if val is sameas true else ('no' if val is sameas false else val) }} +{% endif %} +{% endfor %} \ No newline at end of file diff --git 
a/scheduler/roles/slurm/templates/chrony.conf.j2 b/scheduler/roles/slurm/templates/chrony.conf.j2 new file mode 100644 index 000000000..317a5b50a --- /dev/null +++ b/scheduler/roles/slurm/templates/chrony.conf.j2 @@ -0,0 +1,41 @@ +# Use public servers from the pool.ntp.org project. +# Please consider joining the pool (http://www.pool.ntp.org/join.html). +{% for item in chrony_servers %} +pool {{ item }} iburst +{% endfor %} + + +# Record the rate at which the system clock gains/losses time. +driftfile /var/lib/chrony/drift + +# Allow the system clock to be stepped in the first three updates +# if its offset is larger than 1 second. +makestep 1.0 3 + +# Enable kernel synchronization of the real-time clock (RTC). +rtcsync + +# Enable hardware timestamping on all interfaces that support it. +#hwtimestamp * + +# Increase the minimum number of selectable sources required to adjust +# the system clock. +#minsources 2 + +# Allow NTP client access from local network. +#allow 192.168.0.0/16 + +# Serve time even if not synchronized to a time source. +#local stratum 10 + +# Specify file containing keys for NTP authentication. +keyfile /etc/chrony.keys + +# Get TAI-UTC offset and leap seconds from the system tz database. +leapsectz right/UTC + +# Specify directory for log files. +logdir /var/log/chrony + +# Select which information is logged. 
+#log measurements statistics tracking \ No newline at end of file diff --git a/scheduler/roles/slurm/templates/dbd.conf.j2 b/scheduler/roles/slurm/templates/dbd.conf.j2 new file mode 100644 index 000000000..900174ee0 --- /dev/null +++ b/scheduler/roles/slurm/templates/dbd.conf.j2 @@ -0,0 +1,21 @@ +# ADD DEFAULTS +{% for key in slurm_dbd_conf_dict | sort %} +{% set val = slurm_dbd_conf_dict[key] %} +{% if val is not none and val != omit %} +{{ key }}={{ 'yes' if val is sameas true else ('no' if val is sameas false else val) }} +{% endif %} +{% endfor %} + +{% if 'slurm_dbd' in groups %} +# USER DEFINED SLURM DBD INFO +DbdHost={{ groups['slurm_dbd'][0] }} +DbdPort={{ slurm_dbd_port }} +{% endif %} + +# DATABASE INFO +StorageHost={{ slurm_db_host }} +{% if slurm_db_port is not none and slurm_db_port != omit %} +StoragePort={{ slurm_db_port }} +{% endif %} +StorageUser={{ slurm_dbd_db_username }} +StoragePass={{ slurm_dbd_db_password }} \ No newline at end of file diff --git a/scheduler/roles/slurm/templates/slurm.conf.j2 b/scheduler/roles/slurm/templates/slurm.conf.j2 new file mode 100644 index 000000000..44f3368c8 --- /dev/null +++ b/scheduler/roles/slurm/templates/slurm.conf.j2 @@ -0,0 +1,40 @@ +ClusterName={{ cluster_name }} +{% for host in groups['slurm_control_node'] %} +SlurmctldHost={{ hostvars[host]['inventory_hostname_short'] }} +{% endfor %} +{% if slurm_installation_type == "configless" %} +{% set slurm_ctld_parameters = (slurm_ctld_parameters | default([]) ) + ['enable_configless'] %} +{% endif %} +{% if slurm_ctld_parameters | length > 0 %} +SlurmctldParameters={{ slurm_ctld_parameters | join(',') }} +{% endif %} +{% for key in slurm_conf_dict | sort %} +{% set val = slurm_conf_dict[key] %} +{% if val is not none and val != omit %} +{{ key }}={{ 'yes' if val is sameas true else ('no' if val is sameas false else val) }} +{% endif %} +{% endfor %} + +# SLURM DBD +{% if 'slurm_dbd' in groups %} +AccountingStorageHost={{ groups['slurm_dbd'][0] }} 
+AccountingStoragePort={{ slurm_dbd_port }} +AccountingStorageType=accounting_storage/slurmdbd +{% endif %} + +# COMPUTE NODES +NodeName=DEFAULT State=UNKNOWN +{% if 'slurm_node' in groups %} +{% for host in groups['slurm_node'] %} +NodeName={{ hostvars[host]['inventory_hostname_short'] }} CPUs={{ hostvars[host]['ansible_facts']['processor_nproc'] }} Sockets={{ hostvars[host]['ansible_facts']['processor_count'] }} CoresPerSocket={{ hostvars[host]['ansible_facts']['processor_cores'] }} ThreadsPerCore={{ hostvars[host]['ansible_facts']['processor_threads_per_core'] }} +{% endfor %} +{% endif %} + +# PARTITION INFO (Nodes= uses the same short hostnames as the NodeName entries above) +PartitionName=DEFAULT Nodes=ALL Default=YES MaxTime=INFINITE State=UP +{% if 'slurm_node' in groups %} +PartitionName={{ slurm_partition_name }} Nodes={{ groups['slurm_node'] | map('extract', hostvars, 'inventory_hostname_short') | join(',') }} MaxTime=INFINITE State=UP +{% endif %} +{% for i in partitions %} +PartitionName={{ i.PartitionName }}{% for k in i | sort if k != 'PartitionName' %} {{ k }}={{ 'YES' if i[k] is sameas true else ('NO' if i[k] is sameas false else i[k]) }}{% endfor %} +{% endfor %} \ No newline at end of file diff --git a/scheduler/roles/slurm/vars/main.yml b/scheduler/roles/slurm/vars/main.yml new file mode 100644 index 000000000..fe908b454 --- /dev/null +++ b/scheduler/roles/slurm/vars/main.yml @@ -0,0 +1,75 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- +chrony_servers: + - 2.centos.pool.ntp.org +cluster_name: cluster +slurm_uid: 6001 +slurm_user: slurm +slurm_user_group: slurm +share_path: "{{ share_mounted_path }}" +slurm_share_prefix: "" +slurm_config_dir: /etc/slurm +slurm_dir_name: slurm +slurm_dbd_port: 6819 +slurm_db_port_default: 3306 +slurm_db_type_default: mariadb +slurm_db_username_default: root +slurm_db_password_default: "" +slurm_dbd_db_username: "{{ slurm_user }}" +slurm_dbd_db_password: slurmPassword123! +slurm_db_login_unix_socket: /var/lib/mysql/mysql.sock +slurm_mysql_cnf_path: /etc/my.cnf.d/mysql-server.cnf +slurm_mariadb_cnf_path: /etc/my.cnf.d/mariadb-server.cnf +slurm_munge_key_path: "{{ share_path }}/{{ slurm_dir_name }}/munge.key" +slurm_partition_name: normal +slurm_ctld_parameters: [] +partitions: [] +_clean_before_install: false +conf_file_mode: "0644" +slurm_mode: "0644" +munge_mode: "0400" +common_mode: "0755" +slurm_dbd_mode: "0600" +common_packages: + RedHat: + - firewalld + - python3-firewall +munge_packages: + RedHat: + - munge +slurm_controller_packages: + RedHat: + - slurm-slurmctld +slurm_compute_packages: + RedHat: + - slurm-slurmd +slurm_login_packages: + RedHat: + - slurm +slurm_dbd_packages: + RedHat: + - slurm-slurmdbd +db_packages: + RedHat: + mysql: + - python3-PyMySQL + - mysql-server + mariadb: + - python3-PyMySQL + - mariadb-server + +# messages +inv_validation_fail_msg: "Failed. slurm_control_node and slurm_node groups should be defined in inventory and should have at least one node provided." +intersect_validation_fail_msg: "Failed.
Node mentioned in slurm_control_node group should not be part of slurm_node or login group" diff --git a/scheduler/scheduler.yml b/scheduler/scheduler.yml index 042dec7d0..f8e3e4a75 100644 --- a/scheduler/scheduler.yml +++ b/scheduler/scheduler.yml @@ -17,7 +17,7 @@ when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) - name: Gather facts from all the nodes - hosts: slurm_control_node, kube_control_plane, slurm_node, kube_node, login, etcd + hosts: kube_control_plane, kube_node, etcd - name: Validate scheduler input parameters hosts: localhost @@ -33,13 +33,6 @@ ansible.builtin.import_playbook: ../utils/update_user_repo.yml when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool ) -- name: Apply common Slurm installation and config - hosts: slurm_control_node, slurm_node, login - gather_facts: false - any_errors_fatal: true - roles: - - slurm_common - - name: Prepare kube control plane for kubernetes installations hosts: kube_control_plane gather_facts: false @@ -104,35 +97,25 @@ roles: - k8s_csi_powerscale_plugin -- name: Apply slurm control node config - hosts: slurm_control_node - gather_facts: false - roles: - - slurm_manager - -- name: Configure Slurm worker nodes - hosts: slurm_node, login - gather_facts: false - roles: - - slurm_workers - -- name: Start slurm services on slurm control node - hosts: slurm_control_node - gather_facts: false - roles: - - slurm_start_services - -- name: Start slurm services on slurm node and login - hosts: slurm_node, login - gather_facts: false - roles: - - slurm_workers_service - -- name: Setup slurm pam authentication - hosts: slurm_control_node, slurm_node, login - gather_facts: true +- name: Install Slurm + hosts: slurm_control_node, slurm_node, login, slurm_dbd + any_errors_fatal: true + vars: + share_mounted_path: "{{ hostvars['localhost']['share_path'] | default('/home') }}" # share_path read from localhost hostvars (noted as coming from storage.yml); falls back to /home when undefined + # TODO: apply the nfs_client role here; the slurm role depends on a mandatory 
share path + pre_tasks: + - name: Include input project directory + ansible.builtin.import_role: + name: ../utils/roles/include_input_dir + run_once: true + delegate_to: localhost + - name: Include vars omnia_config.yml + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/omnia_config.yml" + run_once: true + delegate_to: localhost roles: - - slurm_pam + - role: slurm - name: Compile and install the ucx and openmpi on the nfs share of compute nodes hosts: slurm_control_node, kube_control_plane diff --git a/tools/roles/pytorch/tasks/pytorch_verify.yml b/tools/roles/pytorch/tasks/pytorch_verify.yml index 291b4d639..9025f4489 100644 --- a/tools/roles/pytorch/tasks/pytorch_verify.yml +++ b/tools/roles/pytorch/tasks/pytorch_verify.yml @@ -97,7 +97,7 @@ - name: Set gaudi_image_run_cmd ansible.builtin.set_fact: gaudi_image_run_cmd: "nerdctl run -it --privileged -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice - --net=host --ipc=host -v /opt/omnia/:/workspace/ {{ pytorch_gaudi_image }} python /workspace/pytorch_example.py" + --net=host --ipc=host -v /opt/omnia/:/workspace/ {{ pytorch_gaudi_image }} python3 /workspace/pytorch_example.py" - name: Run gaudi container with example file ansible.builtin.command: "{{ gaudi_image_run_cmd }}"