-
Notifications
You must be signed in to change notification settings - Fork 128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Slurm changes for omnia 2.0 #2479
base: pub/new_architecture
Are you sure you want to change the base?
Changes from 8 commits
896a6c2
f01658e
977f880
80ff376
f9244a8
4680b9e
d8f04d9
05e8b27
3ba242b
6136805
b414b25
b673280
2e2a860
55e1646
66fad29
4ec9c79
a2a0be7
6fd0a03
9a28e22
a2e6c8c
9e00ce9
c16d6d2
fe9355f
9e70e82
9fb669a
7d433fe
358ecc2
bf73c31
a30ec2c
e387f57
5376453
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,18 @@ ansible_config_file_path: "/etc/ansible" | |
# The password must not contain -,\, '," | ||
mariadb_password: "password" | ||
|
||
# Host and port of the database server that slurmdbd connects to.
# NOTE(review): the reviewer asked for the db_* values to ship blank and
# commented out; confirm the final defaults before merging.
db_host: "hn2"
db_port: 3306

# Username and password that the slurmdbd user will use
# to access the db.
# NOTE(review): placeholder credential; replace it before deployment.
db_password: "password"
db_username: "slurm"

# Type of db to be used. Options are mysql or mariadb.
# mariadb is the default, in line with the mariadb_password setting above.
db_type: "mariadb"
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let mariadb be the default There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||
|
||
# This variable selects whether slurm is installed in configless mode or with its configuration on an NFS share | ||
# Default value is "configless" | ||
# If the value is "nfs_share", then share_path has to be mentioned | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
--- | ||
# defaults file for slurm

# Optional per-deployment settings; both start out as empty mappings.
mpi: {}
cgroup: {}

# Baseline cgroup settings. The role's tasks combine this mapping with
# `cgroup`, so a key supplied there overrides the value listed here.
__cgroup_default_config:
  CgroupPlugin: "autodetect"
  ConstrainCores: true
  ConstrainDevices: true
  ConstrainRAMSpace: true
  ConstrainSwapSpace: true
# Baseline slurm.conf settings. The role's tasks combine this mapping with
# `slurm_config`, so a key supplied there overrides the value listed here.
__slurm_default_config:
  # Identity and authentication
  SlurmUser: "{{ slurm_user }}"
  AuthType: "auth/munge"
  # NOTE(review): newer Slurm releases appear to name this option CredType;
  # confirm against the Slurm version being deployed.
  CryptoType: "crypto/munge"

  # Network ports
  SlurmctldPort: 6817
  SlurmdPort: 6818
  SrunPortRange: "60001-63000"

  # State and spool directories
  StateSaveLocation: "/var/spool/state"
  SlurmdSpoolDir: "/var/spool/slurmd"

  # Scheduling and plugin selection
  ReturnToService: 2
  SchedulerType: "sched/backfill"
  MpiDefault: "None"
  ProctrackType: "proctrack/cgroup"
  SelectType: "select/linear"

  # Log and pid files
  SlurmctldLogFile: "/var/log/slurmctld.log"
  SlurmdLogFile: "/var/log/slurmd.log"
  SlurmctldPidFile: "/var/run/slurmctld.pid"
  SlurmdPidFile: "/var/run/slurmd.pid"

  # Daemon timeouts
  SlurmctldTimeout: 120
  SlurmdTimeout: 300
# Baseline slurmdbd.conf settings. The role's tasks combine this mapping with
# `slurm_dbd_config`, so a key supplied there overrides the value listed here.
__slurm_dbd_default_config:
  SlurmUser: "{{ slurm_user }}"
  AuthType: "auth/munge"
  StorageType: "accounting_storage/mysql"
  LogFile: "/var/log/slurmdbd.log"
  PidFile: "/var/run/slurmdbd.pid"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
--- | ||
# handlers file for slurm | ||
# Time synchronisation: the unit is `chronyd` on the RedHat OS family and
# `chrony` everywhere else. The handler also enables the unit.
- name: Restart chrony
  ansible.builtin.systemd_service:
    name: "{{ 'chronyd' if ansible_os_family == 'RedHat' else 'chrony' }}"
    enabled: true
    state: restarted

# Munge authentication daemon.
- name: Restart munge
  ansible.builtin.systemd_service:
    name: "munge"
    state: restarted

# Database back ends: one handler per supported service name.
- name: Restart mysqld
  ansible.builtin.systemd_service:
    name: "mysqld"
    state: restarted

- name: Restart mariadb
  ansible.builtin.systemd_service:
    name: "mariadb"
    state: restarted
|
||
# slurmdbd handlers. Both run only when `restart_slurm_services` is truthy
# and the host belongs to the `slurm_dbd` inventory group.

# Reload variant: also re-reads the systemd unit files and enables the unit.
- name: Reload slurmdbd
  when: restart_slurm_services and ('slurm_dbd' in group_names)
  ansible.builtin.systemd_service:
    name: "slurmdbd"
    enabled: true
    daemon_reload: true
    state: reloaded

# Restart variant: a plain service restart.
- name: Restart slurmdbd
  when: restart_slurm_services and ('slurm_dbd' in group_names)
  ansible.builtin.systemd_service:
    name: "slurmdbd"
    state: restarted
|
||
# slurmctld handlers. Both run only when `restart_slurm_services` is truthy
# and the host belongs to the `slurm_control_node` inventory group.

# Reload variant: also re-reads the systemd unit files and enables the unit.
- name: Reload slurmctld
  when: restart_slurm_services and ('slurm_control_node' in group_names)
  ansible.builtin.systemd_service:
    name: "slurmctld"
    enabled: true
    daemon_reload: true
    state: reloaded

# Restart variant: a plain service restart.
- name: Restart slurmctld
  when: restart_slurm_services and ('slurm_control_node' in group_names)
  ansible.builtin.systemd_service:
    name: "slurmctld"
    state: restarted
|
||
# slurmd handlers. Both run only when `restart_slurm_services` is truthy
# and the host belongs to the `slurm_node` inventory group.

# Reload variant: also re-reads the systemd unit files and enables the unit.
- name: Reload slurmd
  when: restart_slurm_services and ('slurm_node' in group_names)
  ansible.builtin.systemd_service:
    name: "slurmd"
    enabled: true
    daemon_reload: true
    state: reloaded

# Restart variant: a plain service restart.
- name: Restart slurmd
  when: restart_slurm_services and ('slurm_node' in group_names)
  ansible.builtin.systemd_service:
    name: "slurmd"
    state: restarted
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
--- | ||
# Role metadata for the slurm role.
galaxy_info:
  author: Jagadeesh N V
  description: Omnia
  company: Dell

  # If the issue tracker for your role is not on github, uncomment the
  # next line and provide a value
  # issue_tracker_url: http://example.com/issue/tracker

  # Choose a valid license ID from https://spdx.org - some suggested licenses:
  # - BSD-3-Clause (default)
  # - MIT
  # - GPL-2.0-or-later
  # - GPL-3.0-only
  # - Apache-2.0
  # - CC-BY-4.0
  license: Apache-2.0

  # Quoted so the version stays a string instead of being parsed as a float.
  # NOTE(review): this role's handlers use ansible.builtin.systemd_service,
  # which likely needs a much newer Ansible than 2.1 - confirm the real minimum.
  min_ansible_version: "2.1"

  # If this a Container Enabled role, provide the minimum Ansible Container version.
  # min_ansible_container_version:

  #
  # Provide a list of supported platforms, and for each platform a list of versions.
  # If you don't wish to enumerate all versions for a particular platform, use 'all'.
  # To view available platforms and versions (or releases), visit:
  # https://galaxy.ansible.com/api/v1/platforms/
  #
  # platforms:
  # - name: Fedora
  #   versions:
  #   - all
  #   - 25
  # - name: SomePlatform
  #   versions:
  #   - all
  #   - 1.0
  #   - 7
  #   - 99.99

  galaxy_tags: []
  # List tags for your role here, one per line. A tag is a keyword that describes
  # and categorizes the role. Users find roles by searching for tags. Be sure to
  # remove the '[]' above, if you add tags to this list.
  #
  # NOTE: A tag is limited to a single word comprised of alphanumeric characters.
  # Maximum 20 tags per role.

dependencies: []
# List your role dependencies here, one per line. Be sure to remove the '[]' above,
# if you add dependencies to this list.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
--- | ||
# Build the effective configuration dictionaries: each role default mapping
# is combined with the matching user-supplied mapping (empty when undefined),
# so user-supplied keys take precedence.
- name: Slurm dict ops
  ansible.builtin.set_fact:
    slurm_conf_dict: >-
      {{ __slurm_default_config | ansible.builtin.combine(slurm_config | default({})) }}
    cgroup_conf_dict: >-
      {{ __cgroup_default_config | ansible.builtin.combine(cgroup | default({})) }}
    slurm_dbd_conf_dict: >-
      {{ __slurm_dbd_default_config | ansible.builtin.combine(slurm_dbd_config | default({})) }}
|
||
# Prefix used by later tasks for every path that lives on the shared location.
# NOTE(review): the name says "if NFS" but the task carries no condition;
# presumably the including playbook gates this file - confirm.
# NOTE(review): a reviewer asked for a copyright header in every file.
- name: Append share path if NFS
  ansible.builtin.set_fact:
    slurm_share_prefix: "{{ [share_path, slurm_dir_name] | join('/') }}"
|
||
# Create the shared config directory, the slurmctld state directory and the
# local config directory, all owned by the slurm user and group.
# The task name no longer embeds `item`: the name is rendered once, outside
# the loop, so the loop variable could never be substituted into it.
- name: Ensure slurm directories exist
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    owner: "{{ slurm_user }}"
    group: "{{ slurm_user_group }}"
    mode: "{{ common_mode }}"
  loop:
    - "{{ slurm_share_prefix }}{{ slurm_config_dir }}"
    - "{{ slurm_conf_dict['StateSaveLocation'] }}"
    - "{{ slurm_config_dir }}"
|
||
# TODO: Can't use run_once and a when condition on the same task;
# maybe this should move to a separate file with a when condition on the include task.
# Render slurmdbd.conf into the shared config directory, on hosts in the
# `slurm_dbd` group only. The destination directory must already exist.
- name: Create slurmdbd.conf
  ansible.builtin.template:  # Create dest directory before
    src: "dbd.conf.j2"
    dest: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurmdbd.conf"
    owner: "{{ slurm_user }}"
    # Use the dedicated group variable, as the directory task in this file does.
    group: "{{ slurm_user_group }}"
    mode: "{{ slurm_dbd_mode }}"
  # run_once: true
  when: '"slurm_dbd" in group_names'
  notify:
    - Restart slurmctld
    - Restart slurmdbd
    - Restart slurmd
|
||
# Render slurm.conf into the shared config directory. The task runs once
# for the whole play; the destination directory must already exist.
- name: Create slurm.conf
  run_once: true
  ansible.builtin.template:  # Create dest directory before
    src: 'slurm.conf.j2'
    dest: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf"
    mode: "{{ conf_file_mode }}"
    owner: root
    group: root
  notify:
    - Restart slurmctld
|
||
# Render cgroup.conf twice: once into the shared config directory and once
# into the local config directory. Both directories must already exist.
- name: Create cgroup.conf
  ansible.builtin.template:  # Create dest directory before
    src: 'cgroup.conf.j2'
    dest: "{{ item }}"
    mode: "{{ conf_file_mode }}"
    owner: root
    group: root
  loop:
    - "{{ slurm_share_prefix }}{{ slurm_config_dir }}/cgroup.conf"
    - "{{ slurm_config_dir }}/cgroup.conf"
  notify:
    - Restart slurmctld
    - Restart slurmdbd
|
||
# Environment Var SLURM_CONF
# Record the shared slurm.conf location as SLURM_CONF in /etc/environment,
# written as KEY=value without spaces around the equals sign.
- name: Set env variable
  community.general.ini_file:
    path: '/etc/environment'
    mode: u=rw,g=r,o=r
    no_extra_spaces: true
    option: SLURM_CONF
    value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf"
|
||
# Failsafe: also render slurmdbd.conf into the local config directory.
# Errors are tolerated on purpose, so a failed local copy does not abort the
# play; the result is registered like the other failsafe tasks in this file.
# NOTE(review): unlike the shared copy, this task is not limited to the
# `slurm_dbd` group - confirm that every host should get slurmdbd.conf.
- name: Local copy slurmdbd.conf /etc/slurm/conf  # parallel writes
  ansible.builtin.template:  # Local copy # FAILSAFE
    src: "dbd.conf.j2"
    dest: "{{ slurm_config_dir }}/slurmdbd.conf"
    owner: "{{ slurm_user }}"
    # Use the dedicated group variable, as the directory task in this file does.
    group: "{{ slurm_user_group }}"
    mode: "{{ slurm_dbd_mode }}"
  ignore_errors: true
  register: ignore_errors_register
  notify:
    - Restart slurmctld
    - Restart slurmdbd
|
||
# Render the optional mpi / acct_gather / gres config files into the shared
# config directory from one generic template. An entry is skipped unless a
# variable with that name exists. Existing files are backed up first.
- name: All other host configs  # TODO: failsafe
  when: item in vars
  ansible.builtin.template:
    src: 'all_other.conf.j2'
    dest: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/{{ item }}.conf"
    backup: true
    mode: "{{ conf_file_mode }}"
  loop:
    - mpi
    - acct_gather
    - gres
  notify:
    - Reload slurmctld
    - Reload slurmd
|
||
# Failsafe: render slurm.conf and cgroup.conf into the local config directory
# as well. Errors are tolerated on purpose and the result is registered.
- name: Local copy slurm.conf /etc/slurm/conf  # parallel writes
  ansible.builtin.template:  # Local copy # FAILSAFE
    src: "{{ item }}.conf.j2"
    dest: "{{ slurm_config_dir }}/{{ item }}.conf"
    mode: "{{ conf_file_mode }}"
    owner: root
    group: root
  loop:
    - slurm
    - cgroup
  register: ignore_errors_register
  ignore_errors: true
  notify:
    - Restart slurmctld
|
||
# Failsafe: render the optional mpi / acct_gather / gres config files into
# the local config directory too. An entry is skipped unless a variable with
# that name exists; errors are tolerated on purpose and the result registered.
- name: All other host local configs  # FAILSAFE
  when: item in vars
  ansible.builtin.template:
    src: 'all_other.conf.j2'
    dest: "{{ slurm_config_dir }}/{{ item }}.conf"
    backup: true
    mode: "{{ conf_file_mode }}"
  loop:
    - mpi
    - acct_gather
    - gres
  register: ignore_errors_register
  ignore_errors: true
  notify:
    - Reload slurmctld
    - Reload slurmd
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
--- | ||
# Enable slurmd and keep the module result; the next task reads the unit
# file path from it.
- name: Enable the service
  ansible.builtin.service:
    enabled: true
    name: "slurmd"
  register: d_service
|
||
# Unit file location as reported in the registered service status, falling
# back to the stock systemd path when the status carries no FragmentPath.
- name: Get service path
  ansible.builtin.set_fact:
    slurmd_service_path: >-
      {{ d_service['status']['FragmentPath'] | default('/usr/lib/systemd/system/slurmd.service') }}
|
||
# Configless mode: drop ConditionPathExists from the [Unit] section of the
# slurmd unit file.
- name: Edit the service !ConditionPathExists
  when: slurm_installation_type == 'configless'
  community.general.ini_file:
    path: "{{ slurmd_service_path }}"
    mode: u=rw,g=r,o=r
    section: Unit
    option: ConditionPathExists
    state: absent
  notify:
    - Reload slurmd
    - Restart slurmd
|
||
# Configless mode: set ExecStart so slurmd starts with --conf-server pointing
# at the first host of the slurm_control_node group (short hostname) and the
# configured SlurmctldPort.
- name: Edit the service ExecStart
  when: slurm_installation_type == 'configless'
  community.general.ini_file:
    path: "{{ slurmd_service_path }}"
    mode: u=rw,g=r,o=r
    section: Service
    option: ExecStart
    # HA
    value: >-
      /usr/sbin/slurmd -D -s $SLURMD_OPTIONS
      --conf-server
      {{ (hostvars[groups['slurm_control_node'] | first])['inventory_hostname_short']
      + ':' + (slurm_conf_dict['SlurmctldPort'] | string) }}
    no_extra_spaces: true
  notify:
    - Reload slurmd
    - Restart slurmd
|
||
# Environment Var SLURM_CONF
# Configless mode: make sure SLURM_CONF is not set in /etc/environment.
- name: Set env variable
  when: slurm_installation_type == 'configless'
  community.general.ini_file:
    path: '/etc/environment'
    mode: u=rw,g=r,o=r
    option: SLURM_CONF
    state: absent
|
||
# Any mode other than configless: point ConditionPathExists in the [Unit]
# section of the slurmd unit file at the shared slurm.conf.
- name: Edit the service ConditionPathExists
  when: slurm_installation_type != 'configless'
  community.general.ini_file:
    path: "{{ slurmd_service_path }}"
    mode: u=rw,g=r,o=r
    section: Unit
    option: ConditionPathExists
    value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf"
    no_extra_spaces: true
  notify:
    - Reload slurmd
    - Restart slurmd
|
||
# Environment Var SLURM_CONF
# Any mode other than configless: record the shared slurm.conf location as
# SLURM_CONF in /etc/environment, written as KEY=value without extra spaces.
- name: Set env variable
  when: slurm_installation_type != 'configless'
  community.general.ini_file:
    path: '/etc/environment'
    mode: u=rw,g=r,o=r
    no_extra_spaces: true
    option: SLURM_CONF
    value: "{{ slurm_share_prefix }}{{ slurm_config_dir }}/slurm.conf"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Cypher-Miller keep these(all db_) vars blank and commented. "hn2" :)