Skip to content

Commit 37adf47

Browse files
nsarkaNicholas Sarkauskas
andauthored
CL_DOCA_UROM (openucx#978)
* CL/DOCA_UROM: Add CL_DOCA_UROM * CL/DOCA_UROM: Clean up * CL/DOCA_UROM: Remove extra code * CL/DOCA_UROM: Remove extra field * CL/DOCA_UROM: Remove extra field * CL/DOCA_UROM: Format plugin * CL/DOCA_UROM: Format CL, add comments * CL/DOCA_UROM: Change to alg name * CL/DOCA_UROM: Review feedback * CL/DOCA_UROM: More review feedback * CL/DOCA_UROM: Review feedback * CL/DOCA_UROM: Review feedback * CL/DOCA_UROM: Review feedback * CL/DOCA_UROM: Review feedback * CL/DOCA_UROM: review feedback * CL/DOCA_UROM: Review feedback --------- Co-authored-by: Nicholas Sarkauskas <[email protected]>
1 parent 36389f4 commit 37adf47

25 files changed

+6656
-10
lines changed

.github/workflows/codestyle.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
fi
3838
fi
3939
H1="CODESTYLE|REVIEW|CORE|UTIL|TEST|API|DOCS|TOOLS|BUILD|MC|EC|SCHEDULE|TOPO"
40-
H2="CI|CL/|TL/|MC/|EC/|UCP|SHM|NCCL|SHARP|BASIC|HIER|CUDA|CPU|EE|RCCL|ROCM|SELF|MLX5"
40+
H2="CI|CL/|TL/|MC/|EC/|UCP|SHM|NCCL|SHARP|BASIC|HIER|DOCA_UROM|CUDA|CPU|EE|RCCL|ROCM|SELF|MLX5"
4141
if ! echo $msg | grep -qP '^Merge |^'"(($H1)|($H2))"'+: \w'
4242
then
4343
echo "Wrong header"

Makefile.am

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
#
2-
# Copyright (c) 2020, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# Copyright (C) Huawei Technologies Co., Ltd. 2020. All rights reserved.
44
# $HEADER$
55
#
66

77
if !DOCS_ONLY
88
SUBDIRS = \
99
src \
10+
contrib \
1011
tools/info \
1112
cmake
1213

config/m4/doca_urom.m4

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#
2+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# See file LICENSE for terms.
4+
#
5+
6+
# CHECK_DOCA_UROM
# ---------------
# Probe for DOCA UROM: header doca_urom.h plus the symbol
# doca_urom_service_create in libdoca_urom (linked together with
# -ldoca_common -ldoca_argp). The outcome is left in $doca_urom_happy as
# "yes" or "no". On success DOCA_UROM_LIBADD is substituted, HAVE_DOCA_UROM is
# defined, and DOCA_UROM_CPPFLAGS / DOCA_UROM_LDFLAGS are substituted when a
# directory was supplied. CPPFLAGS and LDFLAGS are restored after the probe.
# --with-doca_urom=no skips the probe with a warning; a request other than the
# default "guess" that cannot be satisfied is a configure error, while "guess"
# only warns. $doca_urom_checked makes the body run once.
#
# NOTE(review): $with_doca_urom_libdir is read below but no AC_ARG_WITH for it
# is visible in this macro - confirm it is declared elsewhere.
# NOTE(review): for --with-doca_urom=DIR the library directory defaults to
# DIR/lib64; other layouts need the libdir override above.
# NOTE(review): AM_CONDITIONAL sits inside the once-only AS_IF guard; Automake
# expects AM_CONDITIONAL to run on every configure invocation - confirm this
# holds for every path that reaches this macro.
AC_DEFUN([CHECK_DOCA_UROM],[
7+
AS_IF([test "x$doca_urom_checked" != "xyes"],[
8+
doca_urom_happy="no"
9+
AC_ARG_WITH([doca_urom],
10+
[AS_HELP_STRING([--with-doca_urom=(DIR)], [Enable the use of DOCA_UROM (default is guess).])],
11+
[], [with_doca_urom=guess])
12+
AS_IF([test "x$with_doca_urom" != "xno"],
13+
[
14+
save_CPPFLAGS="$CPPFLAGS"
15+
save_LDFLAGS="$LDFLAGS"
16+
AS_IF([test ! -z "$with_doca_urom" -a "x$with_doca_urom" != "xyes" -a "x$with_doca_urom" != "xguess"],
17+
[
18+
AS_IF([test ! -d $with_doca_urom],
19+
[AC_MSG_ERROR([Provided "--with-doca_urom=${with_doca_urom}" location does not exist])])
20+
check_doca_urom_dir="$with_doca_urom"
21+
check_doca_urom_libdir="$with_doca_urom/lib64"
22+
CPPFLAGS="-I$with_doca_urom/include $UCS_CPPFLAGS $save_CPPFLAGS"
23+
LDFLAGS="-L$check_doca_urom_libdir $save_LDFLAGS"
24+
])
25+
AS_IF([test ! -z "$with_doca_urom_libdir" -a "x$with_doca_urom_libdir" != "xyes"],
26+
[
27+
check_doca_urom_libdir="$with_doca_urom_libdir"
28+
LDFLAGS="-L$check_doca_urom_libdir $save_LDFLAGS"
29+
])
30+
AC_CHECK_HEADERS([doca_urom.h],
31+
[
32+
AC_CHECK_LIB([doca_urom], [doca_urom_service_create],
33+
[
34+
doca_urom_happy="yes"
35+
],
36+
[
37+
doca_urom_happy="no"
38+
], [-ldoca_common -ldoca_argp -ldoca_urom])
39+
],
40+
[
41+
doca_urom_happy="no"
42+
])
43+
AS_IF([test "x$doca_urom_happy" = "xyes"],
44+
[
45+
AS_IF([test "x$check_doca_urom_dir" != "x"],
46+
[
47+
AC_MSG_RESULT([DOCA_UROM dir: $check_doca_urom_dir])
48+
AC_SUBST(DOCA_UROM_CPPFLAGS, "-I$check_doca_urom_dir/include/")
49+
])
50+
AS_IF([test "x$check_doca_urom_libdir" != "x"],
51+
[
52+
AC_SUBST(DOCA_UROM_LDFLAGS, "-L$check_doca_urom_libdir")
53+
])
54+
AC_SUBST(DOCA_UROM_LIBADD, "-ldoca_common -ldoca_argp -ldoca_urom")
55+
AC_DEFINE([HAVE_DOCA_UROM], 1, [Enable DOCA_UROM support])
56+
],
57+
[
58+
AS_IF([test "x$with_doca_urom" != "xguess"],
59+
[
60+
AC_MSG_ERROR([DOCA_UROM support is requested but DOCA_UROM packages cannot be found! $CPPFLAGS $LDFLAGS])
61+
],
62+
[
63+
AC_MSG_WARN([DOCA_UROM not found])
64+
])
65+
])
66+
CPPFLAGS="$save_CPPFLAGS"
67+
LDFLAGS="$save_LDFLAGS"
68+
],
69+
[
70+
AC_MSG_WARN([DOCA_UROM was explicitly disabled])
71+
])
72+
doca_urom_checked=yes
73+
AM_CONDITIONAL([HAVE_DOCA_UROM], [test "x$doca_urom_happy" != xno])
74+
])])

configure.ac

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2001-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2001-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# This software product is a proprietary product of Mellanox Technologies Ltd.
44
# (the "Company") and all right, title, and interest and to the software product,
55
# including all associated intellectual property rights, are and shall
@@ -162,6 +162,7 @@ AS_IF([test "x$with_docs_only" = xyes],
162162
AM_CONDITIONAL([HAVE_IBVERBS],[false])
163163
AM_CONDITIONAL([HAVE_RDMACM],[false])
164164
AM_CONDITIONAL([HAVE_MLX5DV],[false])
165+
AM_CONDITIONAL([HAVE_DOCA_UROM], [false])
165166
],
166167
[
167168
AM_CONDITIONAL([DOCS_ONLY], [false])
@@ -172,6 +173,7 @@ AS_IF([test "x$with_docs_only" = xyes],
172173
m4_include([config/m4/cuda.m4])
173174
m4_include([config/m4/nccl.m4])
174175
m4_include([config/m4/rocm.m4])
176+
m4_include([config/m4/doca_urom.m4])
175177
m4_include([config/m4/rccl.m4])
176178
m4_include([config/m4/sharp.m4])
177179
m4_include([config/m4/mpi.m4])
@@ -205,6 +207,9 @@ AS_IF([test "x$with_docs_only" = xyes],
205207
mc_modules="${mc_modules}:rocm"
206208
fi
207209
210+
CHECK_DOCA_UROM
211+
AC_MSG_RESULT([DOCA_UROM support: $doca_urom_happy])
212+
208213
CHECK_GTEST
209214
AC_MSG_RESULT([GTEST support: $gtest_happy])
210215
@@ -224,11 +229,14 @@ LDFLAGS="$LDFLAGS $UCS_LDFLAGS $UCS_LIBADD"
224229
CHECK_TL_COLL_PLUGINS
225230
AC_CONFIG_FILES([
226231
Makefile
232+
contrib/Makefile
233+
contrib/doca_urom_ucc_plugin/Makefile
227234
src/Makefile
228235
src/ucc/api/ucc_version.h
229236
src/core/ucc_version.c
230237
src/components/cl/basic/Makefile
231238
src/components/cl/hier/Makefile
239+
src/components/cl/doca_urom/Makefile
232240
src/components/mc/cpu/Makefile
233241
src/components/mc/cuda/Makefile
234242
src/components/ec/cpu/Makefile
@@ -265,6 +273,7 @@ AC_MSG_NOTICE([ C++ compiler: ${CXX} ${CXXFLAGS} ${BASE_CXXFLAGS}])
265273
AS_IF([test "x$cuda_happy" = "xyes"],[
266274
AC_MSG_NOTICE([ NVCC gencodes: ${NVCC_ARCH}])
267275
])
276+
AC_MSG_NOTICE([ DOCA UROM enabled: ${doca_urom_happy}])
268277
AS_IF([test "x$rocm_happy" = xyes],[
269278
AC_MSG_NOTICE([ROCM architectures: ${ROCM_ARCH}])
270279
])

contrib/Makefile.am

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#
2+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
5+
SUBDIRS = doca_urom_ucc_plugin
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#
2+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
5+
if HAVE_DOCA_UROM
6+
7+
sources = \
8+
common/urom_ucc.h \
9+
dpu/worker_ucc_p2p.c \
10+
dpu/worker_ucc.h \
11+
dpu/worker_ucc.c
12+
13+
plugindir = $(moduledir)/doca_plugins
14+
15+
plugin_LTLIBRARIES = libucc_doca_urom_plugin.la
16+
libucc_doca_urom_plugin_la_SOURCES = $(sources)
17+
libucc_doca_urom_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) $(BASE_CPPFLAGS) $(UCX_CPPFLAGS) $(DOCA_UROM_CPPFLAGS)
18+
libucc_doca_urom_plugin_la_CFLAGS = $(BASE_CFLAGS)
19+
libucc_doca_urom_plugin_la_LDFLAGS = -version-info $(SOVERSION) --as-needed $(UCX_LDFLAGS) $(DOCA_UROM_LDFLAGS)
20+
libucc_doca_urom_plugin_la_LIBADD = $(UCX_LIBADD) $(DOCA_UROM_LIBADD) $(UCC_TOP_BUILDDIR)/src/libucc.la
21+
22+
endif
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
/*
2+
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES, ALL RIGHTS RESERVED.
3+
*
4+
* This software product is a proprietary product of NVIDIA CORPORATION &
5+
* AFFILIATES (the "Company") and all right, title, and interest in and to the
6+
* software product, including all associated intellectual property rights, are
7+
* and shall remain exclusively with the Company.
8+
*
9+
* This software product is governed by the End User License Agreement
10+
* provided with the software product.
11+
*
12+
*/
13+
14+
#ifndef UROM_UCC_H_
15+
#define UROM_UCC_H_
16+
17+
#include <ucp/api/ucp.h>
18+
#include <ucc/api/ucc.h>
19+
20+
#ifdef __cplusplus
21+
extern "C" {
22+
#endif
23+
24+
/* Serialization cursor helper: yields the current *(_iter) position cast to
25+
(_type *), then moves *(_iter) forward by _offset via UCS_PTR_BYTE_OFFSET.
The value produced is the position BEFORE the advance. Written as a
({ ... }) statement expression, which is a GNU C extension. */
26+
#define urom_ucc_serialize_next_raw(_iter, _type, _offset) \
27+
({ \
28+
_type *_result = (_type *)(*(_iter)); \
29+
*(_iter) = UCS_PTR_BYTE_OFFSET(*(_iter), _offset); \
30+
_result; \
31+
})
32+
33+
/*
 * UCC command types. This is the type of the cmd_type field of
 * struct urom_worker_ucc_cmd. No explicit values are assigned, so the
 * enumerators are numbered 0..6 in declaration order.
 */
34+
enum urom_worker_ucc_cmd_type {
35+
UROM_WORKER_CMD_UCC_LIB_CREATE, /* UCC library create command */
36+
UROM_WORKER_CMD_UCC_LIB_DESTROY, /* UCC library destroy command */
37+
UROM_WORKER_CMD_UCC_CONTEXT_CREATE, /* UCC context create command */
38+
UROM_WORKER_CMD_UCC_CONTEXT_DESTROY, /* UCC context destroy command */
39+
UROM_WORKER_CMD_UCC_TEAM_CREATE, /* UCC team create command */
40+
UROM_WORKER_CMD_UCC_COLL, /* UCC collective command */
41+
UROM_WORKER_CMD_UCC_CREATE_PASSIVE_DATA_CHANNEL, /* UCC passive data channel create command */
42+
};
43+
44+
/*
45+
* UCC library create command structure
46+
*
47+
* Input parameters for creating the library handle. The semantics of the
48+
* parameters are defined by ucc.h. On successful completion of
49+
* urom_worker_cmd_ucc_lib_create, the UROM worker will generate a notification
50+
* on the notification queue. This notification has a reference to the local
51+
* library handle on the worker. The implementation can choose to create shadow
52+
* handles or safely pack the library handle on the BlueCC worker to the AEU.
53+
*/
54+
struct urom_worker_cmd_ucc_lib_create {
55+
void *params; /* UCC library parameters (untyped pointer; concrete type not fixed by this header) */
56+
};
57+
58+
/*
 * UCC context create command structure
 *
 * start and array are members of one anonymous union and therefore share
 * storage; per the note on array, stride <= 0 marks the array form.
 * NOTE(review): "number of strides" / "stride size" below do not make clear
 * which of stride and size is the step and which is the count - confirm
 * against the code that consumes this command.
 */
59+
struct urom_worker_cmd_ucc_context_create {
60+
union {
61+
int64_t start; /* Start index */
62+
int64_t *array; /* Set stride to <= 0 if array is used */
63+
};
64+
int64_t stride; /* Set number of strides */
65+
int64_t size; /* Set stride size */
66+
void *base_va; /* Shared buffer address */
67+
uint64_t len; /* Shared buffer length (presumably in bytes - TODO confirm) */
68+
};
69+
70+
/* UCC passive data channel command structure: a UCP worker address and its length */
71+
struct urom_worker_cmd_ucc_pass_dc {
72+
void *ucp_addr; /* UCP worker address on host */
73+
size_t addr_len; /* Length of ucp_addr (presumably in bytes - TODO confirm) */
74+
};
75+
76+
/* UCC context destroy command structure: carries only the context handle */
77+
struct urom_worker_cmd_ucc_context_destroy {
78+
ucc_context_h context_h; /* UCC context handle */
79+
};
80+
81+
/*
 * UCC team create command structure
 *
 * NOTE(review): "number of strides" / "stride size" below do not make clear
 * which of stride and size is the step and which is the count - confirm
 * against the code that consumes this command.
 */
82+
struct urom_worker_cmd_ucc_team_create {
83+
int64_t start; /* Team start index */
84+
int64_t stride; /* Number of strides */
85+
int64_t size; /* Stride size */
86+
ucc_context_h context_h; /* UCC context handle */
87+
};
88+
89+
/* UCC collective command structure: collective arguments, team and work buffer */
90+
struct urom_worker_cmd_ucc_coll {
91+
ucc_coll_args_t *coll_args; /* Collective arguments */
92+
ucc_team_h team; /* UCC team handle */
93+
int use_xgvmi; /* Whether the operation uses XGVMI (presumably non-zero = yes - TODO confirm) */
94+
void *work_buffer; /* Work buffer */
95+
size_t work_buffer_size; /* Size of work_buffer (presumably in bytes - TODO confirm) */
96+
size_t team_size; /* Team size */
97+
};
98+
99+
/*
 * UROM UCC worker command structure
 *
 * A command type, a DPU worker id and an anonymous union of the per-command
 * payload structures. Presumably cmd_type tells which union member is valid -
 * confirm against the worker implementation. The union has no member for
 * UROM_WORKER_CMD_UCC_LIB_DESTROY.
 */
100+
struct urom_worker_ucc_cmd {
101+
enum urom_worker_ucc_cmd_type cmd_type; /* Command type */
102+
uint64_t dpu_worker_id; /* DPU worker id as part of the team */
103+
union {
104+
struct urom_worker_cmd_ucc_lib_create lib_create_cmd; /* Lib create command */
105+
struct urom_worker_cmd_ucc_context_create context_create_cmd; /* Context create command */
106+
struct urom_worker_cmd_ucc_context_destroy context_destroy_cmd; /* Context destroy command */
107+
struct urom_worker_cmd_ucc_team_create team_create_cmd; /* Team create command */
108+
struct urom_worker_cmd_ucc_coll coll_cmd; /* UCC collective command */
109+
struct urom_worker_cmd_ucc_pass_dc pass_dc_create_cmd; /* Passive data channel command */
110+
};
111+
};
112+
113+
/*
 * UCC notification types. This is the type of the notify_type field of
 * struct urom_worker_notify_ucc. No explicit values are assigned, so the
 * enumerators are numbered 0..6 in declaration order.
 */
114+
enum urom_worker_ucc_notify_type {
115+
UROM_WORKER_NOTIFY_UCC_LIB_CREATE_COMPLETE, /* Create UCC library on DPU notification */
116+
UROM_WORKER_NOTIFY_UCC_LIB_DESTROY_COMPLETE, /* Destroy UCC library on DPU notification */
117+
UROM_WORKER_NOTIFY_UCC_CONTEXT_CREATE_COMPLETE, /* Create UCC context on DPU notification */
118+
UROM_WORKER_NOTIFY_UCC_CONTEXT_DESTROY_COMPLETE, /* Destroy UCC context on DPU notification */
119+
UROM_WORKER_NOTIFY_UCC_TEAM_CREATE_COMPLETE, /* Create UCC team on DPU notification */
120+
UROM_WORKER_NOTIFY_UCC_COLLECTIVE_COMPLETE, /* UCC collective completion notification */
121+
UROM_WORKER_NOTIFY_UCC_PASSIVE_DATA_CHANNEL_COMPLETE, /* UCC passive data channel completion notification */
122+
};
123+
124+
/* UCC context create notification structure: carries the context handle */
125+
struct urom_worker_ucc_notify_context_create {
126+
ucc_context_h context; /* UCC context handle */
127+
};
128+
129+
/* UCC team create notification structure: carries the team handle */
130+
struct urom_worker_ucc_notify_team_create {
131+
ucc_team_h team; /* UCC team handle */
132+
};
133+
134+
/* UCC collective notification structure: carries a ucc_status_t for the collective */
135+
struct urom_worker_ucc_notify_collective {
136+
ucc_status_t status; /* UCC collective status */
137+
};
138+
139+
/* UCC passive data channel notification structure: carries a ucc_status_t for the data channel */
140+
struct urom_worker_ucc_notify_pass_dc {
141+
ucc_status_t status; /* UCC data channel status */
142+
};
143+
144+
/*
 * UROM UCC worker notification structure
 *
 * A notification type, a DPU worker id and an anonymous union of the
 * per-notification payload structures. Presumably notify_type tells which
 * union member is valid - confirm against the worker implementation. The
 * union has no member for the LIB_CREATE, LIB_DESTROY and CONTEXT_DESTROY
 * completion types.
 */
145+
struct urom_worker_notify_ucc {
146+
enum urom_worker_ucc_notify_type notify_type; /* Notification type */
147+
uint64_t dpu_worker_id; /* DPU worker id */
148+
union {
149+
struct urom_worker_ucc_notify_context_create context_create_nqe; /* Context create notification */
150+
struct urom_worker_ucc_notify_team_create team_create_nqe; /* Team create notification */
151+
struct urom_worker_ucc_notify_collective coll_nqe; /* Collective notification */
152+
struct urom_worker_ucc_notify_pass_dc pass_dc_nqe; /* Passive data channel notification */
153+
};
154+
};
155+
156+
/*
 * Fixed-size key buffer: two lengths followed by 1024 bytes of storage.
 * NOTE(review): presumably rkeys holds packed remote keys for a source and a
 * destination buffer, with src_len / dst_len giving their byte lengths -
 * confirm against the code that fills and reads this structure, including
 * that src_len + dst_len is checked against sizeof(rkeys).
 */
typedef struct ucc_worker_key_buf {
157+
size_t src_len; /* presumably length of the source key data in rkeys - TODO confirm */
158+
size_t dst_len; /* presumably length of the destination key data in rkeys - TODO confirm */
159+
char rkeys[1024]; /* Key storage; capacity is fixed at 1024 bytes */
160+
} ucc_worker_key_buf;
161+
162+
#ifdef __cplusplus
163+
} /* extern "C" */
164+
#endif
165+
166+
#endif /* UROM_UCC_H_ */

0 commit comments

Comments
 (0)