Skip to content

Commit

Permalink
Merge branch 'main' into fix/ood-defaults
Browse files Browse the repository at this point in the history
  • Loading branch information
sjpb committed Sep 28, 2022
2 parents 75cea09 + 8ca25e8 commit cc24655
Show file tree
Hide file tree
Showing 23 changed files with 69 additions and 40 deletions.
2 changes: 1 addition & 1 deletion ansible/adhoc/hpctests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

---

- hosts: hpctests[0] # TODO: might want to make which node is used selectable?
- hosts: login[0] # TODO: might want to make which node is used selectable?
become: false
gather_facts: false
tasks:
Expand Down
14 changes: 14 additions & 0 deletions ansible/bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,20 @@
- import_role:
name: fail2ban

- name: Setup podman
hosts: podman
tags: podman
tasks:
- import_role:
name: podman
tasks_from: prereqs.yml
tags: prereqs

- import_role:
name: podman
tasks_from: config.yml
tags: config

- hosts: update
gather_facts: false
become: yes
Expand Down
4 changes: 2 additions & 2 deletions ansible/ci/test_reimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@
gather_facts: no
tags: reimage_compute
tasks:
# TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes
# TODO: This is specific to arcus environment config - could generalise to all compute nodes
- name: Request compute node rebuild via Slurm
shell:
cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1]
cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-3]
become: yes

- name: Check compute node rebuild completed
Expand Down
14 changes: 0 additions & 14 deletions ansible/monitoring.yml
Original file line number Diff line number Diff line change
@@ -1,20 +1,6 @@
# ---
# # NOTE: Requires slurmdbd

- name: Setup podman
hosts: podman
tags: podman
tasks:
- import_role:
name: podman
tasks_from: prereqs.yml
tags: prereqs

- import_role:
name: podman
tasks_from: config.yml
tags: config

- name: Setup elasticsearch
hosts: opendistro
tags: opendistro
Expand Down
3 changes: 2 additions & 1 deletion ansible/roles/hpctests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ Role Variables
--------------

- `hpctests_rootdir`: Required. Path to root of test directory tree, which must be on a r/w filesystem shared to all cluster nodes under test. The last directory component will be created.
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'` defining the nodes to use. If not set all nodes in the default partition are used. Note nodes selected **must** be in the default partition.
- `hpctests_partition`: Optional. Name of the partition to use. If not set, the default partition is used.
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'` defining the nodes to use. If not set, all nodes in the selected partition are used.
- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use).
- `hpctests_outdir`: Optional. Directory to use for test output on local host. Defaults to `$HOME/hpctests` (for local user).
- `hpctests_hpl_NB`: Optional, default 192. The HPL block size "NB" - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/intel-oneapi-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html).
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/hpctests/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ hpctests_hpl_NB: 192
hpctests_hpl_mem_frac: 0.8
hpctests_hpl_arch: linux64
#hpctests_nodes:
#hpctests_partition:
3 changes: 1 addition & 2 deletions ansible/roles/hpctests/library/slurm_node_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
options
nodes:
description:
- Slurm nodenames for which information is required. These must be homogenous.
- Slurm nodenames for which information is required.
required: true
type: list
requirements:
Expand Down Expand Up @@ -56,7 +56,6 @@ def run_module():
print(values)
for ix, param in enumerate(params):
info[param] = [nodeinfo[ix].strip() for nodeinfo in values if nodeinfo[nodelist_ix].strip() in module.params['nodes']]
# info[param] = [nodeinfo[nodelist_ix] for nodeinfo in values]
result['info'] = info

module.exit_json(**result)
Expand Down
14 changes: 10 additions & 4 deletions ansible/roles/hpctests/tasks/hpl-solo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
- debug:
msg: "Using {{ hpctests_hplsolo_ntasks }} process per node with P={{ hpctests_hplsolo_pq.grid.P }}, Q={{ hpctests_hplsolo_pq.grid.Q }} targeting {{ (hpctests_hpl_mem_frac | float) * 100 }}% of {{ hpctests_nodeinfo.info['MEMORY'][0] }} MB memory per node, block size (NB) = {{ hpctests_hpl_NB }}, problem size (N) = {{ hpctests_hplsolo_N }}"

- name: Get all nodes
shell: "sinfo --Node --noheader --format %N" # TODO: assumes only one partition, although actually excluding nodes not in the default partition should be fine.
- name: Get all nodes in partition
shell: "sinfo --Node --noheader --format %N --partition={{ hpctests_partition }}"
register: all_nodes
changed_when: false

Expand Down Expand Up @@ -74,6 +74,11 @@
vars:
hpctests_hplsolo_ntasks: 2 # TODO: FIXME

- name: Remove previous outputs
# The number of output files depends on the number of nodes, which can differ between partitions, so files from a previous run would not all be overwritten.
shell:
cmd: "rm -f {{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh.*.out"

- name: Run hpl-solo
shell: sbatch --wait hpl-solo.sh
become: no
Expand Down Expand Up @@ -111,10 +116,11 @@
tags: postpro
debug:
msg: |
Summary for hpl-solo ({{ hpctests_computes.stdout_lines | length }} nodes) job {{ hpctests_hplsolo_sbatch.stdout.split()[-1] }} using {{ hpctests_ucx_net_devices }}:
Summary for hpl-solo on {{ hpctests_computes.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_hplsolo_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}':
Max: {{ perf.stdout_lines | map('float') | max }} gflops
Min: {{ perf.stdout_lines | map('float') | min }} gflops
Mean: {{ (perf.stdout_lines | map('float') | sum) / (hpctests_computes.stdout_lines | length) }} gflops
Mean: {{ (perf.stdout_lines | map('float') | sum) / (hpctests_computes.stdout_lines | length) }} gflops
Individual node results (gflops):
{{ dict(hpctests_computes.stdout_lines | zip(perf.stdout_lines | map('float') )) | to_nice_yaml }}
4 changes: 3 additions & 1 deletion ansible/roles/hpctests/tasks/pingmatrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
- name: Summarise results
debug:
msg: |
Summary for pingmatrix (pairwise on {{ slurm_names.stdout_lines | length }} nodes) job {{ hpctests_pingmatrix_sbatch.stdout.split()[-1] }} using {{ hpctests_ucx_net_devices }}:
Summary for pingmatrix pairwise over {{ slurm_names.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_pingmatrix_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}':
{{ nxnlatbw['stats'] | to_nice_yaml }}
Tabular output on ansible control host at {{ hpctests_outdir }}/pingmatrix.html
9 changes: 5 additions & 4 deletions ansible/roles/hpctests/tasks/pingpong.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,11 @@

- debug:
msg: |
Summary for pingpong (2x scheduler-selected nodes) job {{ _pingpong_jobid }} (using interface {{ hpctests_ucx_net_devices }}):
nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }}
zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
Summary for pingpong using 2x scheduler-selected nodes in '{{ hpctests_partition }}' partition, job ID {{ _pingpong_jobid }}, device '{{ hpctests_ucx_net_devices }}':
Nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }}
Zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
Max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
See plot on localhost:
{{ _pingpong_plot.stdout }}
12 changes: 11 additions & 1 deletion ansible/roles/hpctests/tasks/setup.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
---

- name: Get partition information
shell: "sinfo --format %P --noheader"
register: _sinfo_partitions
changed_when: false

- name: Select default partition if hpctests_partition not given
set_fact:
hpctests_partition: "{{ _sinfo_partitions.stdout_lines | select('contains', '*') | first | trim('*') }}"
when: hpctests_partition is not defined

- name: Get info about compute nodes
shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --format %N"
shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format %N"
register: hpctests_computes
changed_when: false
failed_when: hpctests_computes.rc != 0
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/hpctests/templates/hpl-build.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#SBATCH --output=%x.%a.out
#SBATCH --error=%x.%a.out
#SBATCH --exclusive
#SBATCH --partition={{ hpctests_partition }}
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_computes.stdout_lines[0] }}{% endif %}

echo HPL arch: {{ hpctests_hpl_arch }}
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/hpctests/templates/hpl-solo.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#SBATCH --error=%x.%a.out
#SBATCH --exclusive
#SBATCH --array=0-{{ hpctests_computes.stdout_lines | length - 1 }}
#SBATCH --partition={{ hpctests_partition }}
{% if hpctests_hplsolo_excluded_nodes | length > 0 %}
#SBATCH --exclude={{ hpctests_hplsolo_excluded_nodes | join(',') }}
{% endif %}
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/hpctests/templates/pingmatrix.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#SBATCH --output=%x.out
#SBATCH --error=%x.out
#SBATCH --exclusive
#SBATCH --partition={{ hpctests_partition }}
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %}

export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }}
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/hpctests/templates/pingpong.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#SBATCH --output=%x.out
#SBATCH --error=%x.out
#SBATCH --exclusive
#SBATCH --partition={{ hpctests_partition }}
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_nodes }}{% endif %}

export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }}
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/mysql/tasks/configure.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# no_log: true # TODO: FIXME
register: _mysql_info
until: "'version' in _mysql_info"
retries: 60
retries: 90
delay: 2

- name: Ensure mysql databases created
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/mysql/templates/mysql.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Restart=always
EnvironmentFile=/etc/sysconfig/mysqld
# The above EnvironmentFile must define MYSQL_INITIAL_ROOT_PASSWORD
ExecStartPre=+install -d -o {{ mysql_podman_user }} -g {{ mysql_podman_user }} -Z container_file_t {{ mysql_datadir }}
ExecStartPre=+chown -R {{ mysql_podman_user }}:{{ mysql_podman_user }} {{ mysql_datadir }}
ExecStart=/usr/bin/podman run \
--network slirp4netns:cidr={{ podman_cidr }} \
--sdnotify=conmon --cgroups=no-conmon \
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/opendistro/templates/opendistro.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ After=network-online.target
Environment=PODMAN_SYSTEMD_UNIT=%n
Restart=always
ExecStartPre=+install -d -o {{ opendistro_podman_user }} -g {{ opendistro_podman_user }} -Z container_file_t {{ opendistro_data_path }}
ExecStartPre=+chown -R {{ opendistro_podman_user }}:{{ opendistro_podman_user }} {{ opendistro_data_path }}
ExecStart=/usr/bin/podman run \
--network slirp4netns:cidr={{ podman_cidr }} \
--sdnotify=conmon --cgroups=no-conmon \
Expand Down
3 changes: 2 additions & 1 deletion environments/arcus/hooks/check_slurm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@
<end>
vars:
expected_sinfo:
- "{{ openhpc_cluster_name }}-compute-[0-1] {{ openhpc_slurm_partitions[0].name }}* up 60-00:00:00 2 idle"
- "{{ openhpc_cluster_name }}-compute-[0-1] small* up 60-00:00:00 2 idle"
- "{{ openhpc_cluster_name }}-compute-[2-3] extra up 60-00:00:00 2 idle"
6 changes: 6 additions & 0 deletions environments/arcus/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,16 @@ module "cluster" {
flavor: "vm.alaska.cpu.general.small"
image: "openhpc-220830-2042.qcow2"
}
extra: {
flavor: "vm.alaska.cpu.general.small"
image: "openhpc-220830-2042.qcow2"
}
}
compute_nodes = {
compute-0: "small"
compute-1: "small"
compute-2: "extra"
compute-3: "extra"
}

environment_root = var.environment_root
Expand Down
3 changes: 0 additions & 3 deletions environments/common/inventory/groups
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ login
control
compute

[hpctests]
# Login group to use for running mpi-based testing.

[cluster:children]
# All nodes in the appliance - add e.g. service nodes not running Slurm here.
openhpc
Expand Down
3 changes: 0 additions & 3 deletions environments/common/layouts/everything
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
[nfs:children]
openhpc

[hpctests:children]
login

[mysql:children]
control

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ ${compute.name} ansible_host=${[for n in compute.network: n.fixed_ip_v4 if n.acc
%{~ for type_name, type_descr in compute_types}
[${cluster_name}_${type_name}]
%{~ for node_name, node_type in compute_nodes ~}
%{~ if node_type == type_name }${cluster_name}-${node_name}%{ endif }
%{~ endfor ~}
%{~ if node_type == type_name ~}
${cluster_name}-${node_name}
%{~ endif ~}
%{~ endfor ~}
%{ endfor ~}

0 comments on commit cc24655

Please sign in to comment.