From bd2de50984f2f4048b7f7e97e153d9dda6b0268e Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 7 Aug 2024 15:27:13 +0100
Subject: [PATCH] Update hpctests to obey UCX_NET_DEVICES when RoCE devices present (#421)

* Turn off higher priority MPI net devices

* Update pingmatrix.sh.j2

* Update pingmatrix.sh.j2

* Update pingpong.sh.j2

* Replace j2 comments with bash

* Update pingpong.sh.j2

---------

Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com>
---
 ansible/roles/hpctests/templates/pingmatrix.sh.j2 | 6 +++++-
 ansible/roles/hpctests/templates/pingpong.sh.j2   | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2
index d886e9ac8..990018d85 100644
--- a/ansible/roles/hpctests/templates/pingmatrix.sh.j2
+++ b/ansible/roles/hpctests/templates/pingmatrix.sh.j2
@@ -16,4 +16,8 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES
 module load {{ hpctests_pingmatrix_modules | join(' ' ) }}
 
 mpicc -o nxnlatbw mpi_nxnlatbw.c
-mpirun nxnlatbw
+
+# mpirun flags force using UCX TCP transports, overriding higher
+# priority of OpenMPI btl/openib component, which is also using RDMA
+# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100
+mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any nxnlatbw
diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2
index 4dc2eebd5..dad4499b1 100644
--- a/ansible/roles/hpctests/templates/pingpong.sh.j2
+++ b/ansible/roles/hpctests/templates/pingpong.sh.j2
@@ -16,4 +16,8 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES
 module load {{ hpctests_pingpong_modules | join(' ' ) }}
 
 #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1
-mpirun IMB-MPI1 pingpong
+
+# mpirun flags force using UCX TCP transports, overriding higher
+# priority of OpenMPI btl/openib component, which is also using RDMA
+# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100
+mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any IMB-MPI1 pingpong