Skip to content

Commit d30ed1e

Browse files
V0.5
Release Notes: • Added 2 New tests to measure CPU <-> GPU and GPU <-> GPU latencies. • Added a more robust way to detect the GPUs on the system during build phase. • Added an option to output the test results in JSON for machine parsing. • Added PCI bus, device details to device listing in test output • Better Error Handling in the codebase • General Bug fixes.
1 parent ba3a395 commit d30ed1e

25 files changed

+9746
-542
lines changed

CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
88
set(CMAKE_CUDA_STANDARD 17)
99
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
1010

11+
set(supported_archs "52" "70" "75" "80" "86" "89" "90")
12+
13+
message(STATUS "Detecting underlying CUDA Arch to set CMAKE_CUDA_ARCHITECTURES")
14+
include(detect_cuda_arch.cmake)
15+
# Set CMAKE_CUDA_ARCHITECTURES based on the underlying device
16+
cuda_detect_architectures(supported_archs CMAKE_CUDA_ARCHITECTURES)
17+
1118
if(NOT CMAKE_BUILD_TYPE)
1219
set(CMAKE_BUILD_TYPE "Release")
1320
endif()
@@ -30,6 +37,9 @@ set(src
3037
kernels.cu
3138
memcpy.cpp
3239
nvbandwidth.cpp
40+
output.cpp
41+
json_output.cpp
42+
json/jsoncpp.cpp
3343
)
3444

3545
execute_process(
@@ -49,3 +59,4 @@ endif()
4959
add_executable(nvbandwidth ${src})
5060
target_include_directories(nvbandwidth PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} .)
5161
target_link_libraries(nvbandwidth Boost::program_options ${NVML_LIB_NAME} cuda)
62+

Licenses.txt

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
JsonCpp:
2+
Copyright Baptiste Lepilleur - Public domain and MIT licenses
3+
Attribution statements: Nvidia actively chooses to accept jsoncpp as public domain where acceptable and MIT licensed where public domain is not accepted.
4+
License text ( https://github.com/open-source-parsers/jsoncpp/blob/master/LICENSE )
5+
6+
/*!
7+
* The JsonCpp library's source code, including accompanying documentation,
8+
* tests and demonstration applications, are licensed under the following
9+
* conditions...
10+
*
11+
* Baptiste Lepilleur and The JsonCpp Authors explicitly disclaim copyright in all
12+
* jurisdictions which recognize such a disclaimer. In such jurisdictions,
13+
* this software is released into the Public Domain.
14+
*
15+
* In jurisdictions which do not recognize Public Domain property (e.g. Germany as of
16+
* 2010), this software is Copyright (c) 2007-2010 by Baptiste Lepilleur and
17+
* The JsonCpp Authors, and is released under the terms of the MIT License (see below).
18+
*
19+
* In jurisdictions which recognize Public Domain property, the user of this
20+
* software may choose to accept it either as 1) Public Domain, 2) under the
21+
* conditions of the MIT License (see below), or 3) under the terms of dual
22+
* Public Domain/MIT License conditions described here, as they choose.
23+
*
24+
* The MIT License is about as close to Public Domain as a license can get, and is
25+
* described in clear, concise terms at:
26+
*
27+
* http://en.wikipedia.org/wiki/MIT_License
28+
*
29+
* The full text of the MIT License follows:
30+
*
31+
* ========================================================================
32+
* Copyright (c) 2007-2010 Baptiste Lepilleur and The JsonCpp Authors
33+
*
34+
* Permission is hereby granted, free of charge, to any person
35+
* obtaining a copy of this software and associated documentation
36+
* files (the "Software"), to deal in the Software without
37+
* restriction, including without limitation the rights to use, copy,
38+
* modify, merge, publish, distribute, sublicense, and/or sell copies
39+
* of the Software, and to permit persons to whom the Software is
40+
* furnished to do so, subject to the following conditions:
41+
*
42+
* The above copyright notice and this permission notice shall be
43+
* included in all copies or substantial portions of the Software.
44+
*
45+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
46+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
47+
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
48+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
49+
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
50+
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
51+
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
52+
* SOFTWARE.
53+
* ========================================================================
54+
* (END LICENSE TEXT)
55+
*
56+
* The MIT license is compatible with both the GPL and commercial
57+
* software, affording one all of the rights of Public Domain with the
58+
* minor nuisance of being required to keep the above copyright notice
59+
* and license text in the source code. Note also that by accepting the
60+
* Public Domain "license" you can re-license your copy using whatever
61+
* license you like.
62+
*/
63+

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ Install a cuda toolkit (version 11.X or above)
1010

1111
Install a compiler package which supports C++17. GCC 7.x or above is a possible option.
1212

13-
Install cmake (version 3.20 or above)
13+
Install cmake (version 3.20 or above).
14+
CMake version 3.24 or newer is encouraged.
1415

1516
Install Boost program options library (More details in the next section)
1617

1718
Ensure that the path to the nvcc binary (installed via the toolkit) is available in the $PATH variable on Linux systems
18-
19+
In order to run nvbandwidth, the system should have a CUDA enabled GPU and an NVIDIA display driver that is compatible with the CUDA Toolkit being used to build nvbandwidth.
20+
For more information, refer to https://docs.nvidia.com/deploy/cuda-compatibility/
1921

2022
## Dependencies
2123
To build and run nvbandwidth please install the Boost program_options library (https://www.boost.org/doc/libs/1_66_0/doc/html/program_options.html).

common.h

Lines changed: 27 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
#ifndef COMMON_H
1919
#define COMMON_H
2020

21-
#include <cassert>
2221
#include <cmath>
2322
#include <cstdlib>
2423
#include <cuda.h>
@@ -36,30 +35,51 @@
3635
#include <optional>
3736
#include <cstring>
3837

38+
#define STRING_LENGTH 256
39+
3940
// Default constants
4041
const unsigned long long defaultLoopCount = 16;
4142
const unsigned long long defaultBufferSize = 64; // 64MB
4243
const unsigned int defaultAverageLoopCount = 3;
4344
const unsigned int _MiB = 1024 * 1024;
4445
const unsigned int numThreadPerBlock = 512;
45-
46+
const unsigned int strideLen = 16; /* cacheLine size 128 Bytes, 16 words */
47+
const unsigned int latencyMemAccessCnt = 100000; /* 100k read accesses to gauge latency */
4648
extern int deviceCount;
4749
extern unsigned int averageLoopCount;
4850
extern bool disableAffinity;
4951
extern bool skipVerification;
5052
extern bool useMean;
53+
extern bool jsonOutput;
5154
// Verbosity
5255
extern bool verbose;
56+
5357
class Verbosity {
5458
public:
55-
Verbosity() = default;
59+
bool &controlVariable;
60+
61+
Verbosity(bool &controlVariable): controlVariable(controlVariable) {};
62+
5663
template<typename T>
5764
Verbosity& operator<<(T input) {
58-
if (verbose) std::cout << input;
59-
return *this;
65+
if (!jsonOutput && controlVariable) std::cout << input;
66+
return *this;
67+
}
68+
69+
using StreamType = decltype(std::cout);
70+
Verbosity &operator<<(StreamType &(*func)(StreamType &)) {
71+
if (!jsonOutput && controlVariable) {
72+
func(std::cout);
73+
}
74+
return *this;
6075
}
6176
};
6277
extern Verbosity VERBOSE;
78+
extern Verbosity OUTPUT;
79+
80+
#ifdef _MSC_VER
81+
#define __PRETTY_FUNCTION__ __FUNCTION__
82+
#endif
6383

6484
// Rounds n up to the nearest multiple of "multiple".
6585
// if n is already a multiple of "multiple", n is returned unchanged.
@@ -175,130 +195,9 @@ class PerformanceStatistic {
175195
}
176196
};
177197

178-
template <class T> struct PeerValueMatrix {
179-
std::optional <T> *m_matrix;
180-
int m_rows, m_columns;
181-
std::string key;
182-
183-
PeerValueMatrix(int rows, int columns, std::string key = ""): m_matrix(new std::optional <T>[rows * columns]()), m_rows(rows), m_columns(columns), key(key) {}
184-
185-
~PeerValueMatrix() { delete[] m_matrix; }
186-
std::optional <T> &value(int src, int dst) {
187-
assert(src >= 0 && src < m_rows);
188-
assert(dst >= 0 && dst < m_columns);
189-
return m_matrix[src * m_columns + dst];
190-
}
191-
const std::optional <T> &value(int src, int dst) const {
192-
assert(src >= 0 && src < m_rows);
193-
assert(dst >= 0 && dst < m_columns);
194-
return m_matrix[src * m_columns + dst];
195-
}
198+
struct LatencyNode {
199+
struct LatencyNode *next;
196200
};
197201

198-
template <class T>
199-
std::ostream &operator<<(std::ostream &o, const PeerValueMatrix<T> &matrix) {
200-
// This assumes T is numeric
201-
T maxVal = std::numeric_limits<T>::min();
202-
T minVal = std::numeric_limits<T>::max();
203-
T sum = 0;
204-
int count = 0;
205-
206-
o << ' ';
207-
for (int currentDevice = 0; currentDevice < matrix.m_columns; currentDevice++) {
208-
o << std::setw(10) << currentDevice;
209-
}
210-
o << std::endl;
211-
for (int currentDevice = 0; currentDevice < matrix.m_rows; currentDevice++) {
212-
o << currentDevice;
213-
for (int peer = 0; peer < matrix.m_columns; peer++) {
214-
std::optional <T> val = matrix.value(currentDevice, peer);
215-
if (val) {
216-
o << std::setw(10) << val.value();
217-
}
218-
else {
219-
o << std::setw(10) << "N/A";
220-
}
221-
sum += val.value_or(0.0);
222-
maxVal = std::max(maxVal, val.value_or(0.0));
223-
minVal = std::min(minVal, val.value_or(0.0));
224-
if (val.value_or(0.0) > 0) count++;
225-
}
226-
o << std::endl;
227-
}
228-
o << std::endl;
229-
o << "SUM " << matrix.key << " " << sum << std::endl;
230-
231-
VERBOSE << "MIN " << matrix.key << " " << minVal << '\n';
232-
VERBOSE << "MAX " << matrix.key << " " << maxVal << '\n';
233-
VERBOSE << "AVG " << matrix.key << " " << sum / count << '\n';
234-
return o;
235-
}
236-
237-
// CUDA Error handling
238-
inline void CU_ASSERT(CUresult cuResult, const char *msg = nullptr) {
239-
if (cuResult != CUDA_SUCCESS) {
240-
const char *errDescStr, *errNameStr;
241-
cuGetErrorString(cuResult, &errDescStr);
242-
cuGetErrorName(cuResult, &errNameStr);
243-
std::cout << "[" << errNameStr << "] " << errDescStr;
244-
if (msg != nullptr) std::cout << ":\n\t" << msg;
245-
std::cout << std::endl;
246-
std::exit(1);
247-
}
248-
}
249-
250-
// NVML Error handling
251-
inline void NVML_ASSERT(nvmlReturn_t nvmlResult, const char *msg = nullptr) {
252-
if (nvmlResult != NVML_SUCCESS) {
253-
std::cout << "NVML_ERROR: [" << nvmlErrorString(nvmlResult) << "]";
254-
if (msg != nullptr) std::cout << ":\n\t" << msg;
255-
std::cout << std::endl;
256-
std::exit(1);
257-
}
258-
}
259-
260-
// NUMA optimal affinity
261-
inline void setOptimalCpuAffinity(int cudaDeviceID) {
262-
#ifdef _WIN32
263-
// NVML doesn't support setting affinity on Windows
264-
return;
265-
#endif
266-
if (disableAffinity) {
267-
return;
268-
}
269-
270-
nvmlDevice_t device;
271-
CUuuid dev_uuid;
272-
273-
std::stringstream s;
274-
std::unordered_set <unsigned char> dashPos {0, 4, 6, 8, 10};
275-
276-
CU_ASSERT(cuDeviceGetUuid(&dev_uuid, cudaDeviceID));
277-
278-
s << "GPU";
279-
for (int i = 0; i < 16; i++) {
280-
if (dashPos.count(i)) {
281-
s << '-';
282-
}
283-
s << std::hex << std::setfill('0') << std::setw(2) << (0xFF & (int)dev_uuid.bytes[i]);
284-
}
285-
286-
NVML_ASSERT(nvmlDeviceGetHandleByUUID(s.str().c_str(), &device));
287-
nvmlReturn_t result = nvmlDeviceSetCpuAffinity(device);
288-
if (result != NVML_ERROR_NOT_SUPPORTED) {
289-
NVML_ASSERT(result);
290-
}
291-
}
292-
293-
inline bool isMemoryOwnedByCUDA(void *memory) {
294-
CUmemorytype memorytype;
295-
CUresult status = cuPointerGetAttribute(&memorytype, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)memory);
296-
if (status == CUDA_ERROR_INVALID_VALUE) {
297-
return false;
298-
} else {
299-
CU_ASSERT(status);
300-
return true;
301-
}
302-
}
303202

304203
#endif

debian_install.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
# necessary software components needed to
44
# build nvbandwidth
55

6-
apt install build-essential
7-
apt install libboost-program-options-dev
8-
apt install cmake
6+
apt install -y build-essential
7+
apt install -y libboost-program-options-dev
8+
apt install -y cmake
99
output=$(cmake --version | sed -n 1p | sed 's/[^0-9]*//g')
1010
if [ $output -lt 3200 ]; then
1111
echo "Upgrade cmake version to 3.20 or above to build nvbandwidth"

detect_cuda_arch.cmake

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
include_guard(GLOBAL)

# cuda_detect_architectures(<possible_archs_var> <out_var>)
#
# Compiles and runs a small CUDA probe that queries the compute capability of
# every GPU visible at configure time, and stores the detected architecture
# list (e.g. "70;80") in <out_var> in the caller's scope.  If no CUDA compiler
# is configured this is a fatal error; if detection fails (no usable device,
# compile or run error) the fallback list held in the variable named by
# <possible_archs_var> is used instead.
#
# Adapted from
# https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/detail/detect_architectures.cmake
function(cuda_detect_architectures possible_archs_var gpu_archs)
  # Start from the caller-supplied fallback list.
  set(__gpu_archs ${${possible_archs_var}})

  # Keep every artifact in the binary dir with absolute paths so the probe
  # never depends on the configure-time working directory and never writes
  # into the source tree.
  set(eval_file "${CMAKE_BINARY_DIR}/eval_gpu_archs.cu")
  set(eval_exe "${CMAKE_BINARY_DIR}/eval_gpu_archs")
  set(error_file "${CMAKE_BINARY_DIR}/eval_gpu_archs.stderr.log")

  if(NOT DEFINED CMAKE_CUDA_COMPILER)
    message(FATAL_ERROR "No CUDA compiler specified, unable to determine machine's GPUs.")
  endif()

  if(NOT EXISTS "${eval_exe}")
    # The probe prints the detected "majorminor" values as a ;-separated list,
    # or the fallback list baked in below when no device is usable.
    file(WRITE "${eval_file}"
"
#include <cstdio>
#include <set>
#include <string>
using namespace std;
int main(int argc, char** argv) {
  set<string> archs;
  int nDevices;
  if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) {
    for(int dev=0;dev<nDevices;++dev) {
      char buff[32];
      cudaDeviceProp prop;
      if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
      sprintf(buff, \"%d%d\", prop.major, prop.minor);
      archs.insert(buff);
    }
  }
  if(archs.empty()) {
    printf(\"${__gpu_archs}\");
  } else {
    bool first = true;
    for(const auto& arch : archs) {
      printf(first? \"%s\" : \";%s\", arch.c_str());
      first = false;
    }
  }
  printf(\"\\n\");
  return 0;
}
")
    execute_process(
      COMMAND ${CMAKE_CUDA_COMPILER} -std=c++11 -o "${eval_exe}" "${eval_file}"
      WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
      RESULT_VARIABLE __compile_rc
      ERROR_FILE "${error_file}")
    if(NOT __compile_rc EQUAL 0)
      # A failed compile can leave a partial artifact behind; make sure the
      # EXISTS check below does not pick it up.
      file(REMOVE "${eval_exe}")
    endif()
  endif()

  if(EXISTS "${eval_exe}")
    execute_process(
      COMMAND "${eval_exe}"
      OUTPUT_VARIABLE __detected_archs
      RESULT_VARIABLE __run_rc
      OUTPUT_STRIP_TRAILING_WHITESPACE
      ERROR_FILE "${error_file}")
    # Only trust the probe's output on a clean, non-empty run; otherwise keep
    # the fallback list instead of clobbering it with empty/garbage output.
    if(__run_rc EQUAL 0 AND NOT "${__detected_archs}" STREQUAL "")
      set(__gpu_archs ${__detected_archs})
      message(STATUS "Auto detection of gpu-archs: ${__gpu_archs}")
    else()
      message(STATUS "Failed auto detection of gpu-archs. Falling back to using ${__gpu_archs}.")
    endif()
  else()
    message(STATUS "Failed auto detection of gpu-archs. Falling back to using ${__gpu_archs}.")
  endif()

  # Remove the configure-time build artifacts.
  file(REMOVE "${eval_file}" "${eval_exe}" "${error_file}")

  set(${gpu_archs} ${__gpu_archs} PARENT_SCOPE)
endfunction()

0 commit comments

Comments
 (0)