Skip to content

Commit d30ed1e

Browse files
V0.5
Release Notes: • Added 2 New tests to measure CPU <-> GPU and GPU <-> GPU latencies. • Added a more robust way to detect the GPUs on the system during build phase. • Added an option to output the test results in JSON for machine parsing. • Added PCI bus, device details to device listing in test output • Better Error Handling in the codebase • General Bug fixes.
1 parent ba3a395 commit d30ed1e

25 files changed

+9746
-542
lines changed

CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
88
set(CMAKE_CUDA_STANDARD 17)
99
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
1010

11+
set(supported_archs "52" "70" "75" "80" "86" "89" "90")
12+
13+
message(STATUS "Detecting underlying CUDA Arch to set CMAKE_CUDA_ARCHITECTURES")
14+
include(detect_cuda_arch.cmake)
15+
# Set CMAKE_CUDA_ARCHITECTURES based on the underlying device
16+
cuda_detect_architectures(supported_archs CMAKE_CUDA_ARCHITECTURES)
17+
1118
if(NOT CMAKE_BUILD_TYPE)
1219
set(CMAKE_BUILD_TYPE "Release")
1320
endif()
@@ -30,6 +37,9 @@ set(src
3037
kernels.cu
3138
memcpy.cpp
3239
nvbandwidth.cpp
40+
output.cpp
41+
json_output.cpp
42+
json/jsoncpp.cpp
3343
)
3444

3545
execute_process(
@@ -49,3 +59,4 @@ endif()
4959
add_executable(nvbandwidth ${src})
5060
target_include_directories(nvbandwidth PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} .)
5161
target_link_libraries(nvbandwidth Boost::program_options ${NVML_LIB_NAME} cuda)
62+

Licenses.txt

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
JsonCpp:
2+
Copyright Baptiste Lepilleur - Public domain and MIT licenses
3+
Attribution statements: Nvidia actively chooses to accept jsoncpp as public domain where acceptable and MIT licensed where public domain is not accepted.
4+
License text ( https://github.com/open-source-parsers/jsoncpp/blob/master/LICENSE )
5+
6+
/*!
7+
* The JsonCpp library's source code, including accompanying documentation,
8+
* tests and demonstration applications, are licensed under the following
9+
* conditions...
10+
*
11+
* Baptiste Lepilleur and The JsonCpp Authors explicitly disclaim copyright in all
12+
* jurisdictions which recognize such a disclaimer. In such jurisdictions,
13+
* this software is released into the Public Domain.
14+
*
15+
* In jurisdictions which do not recognize Public Domain property (e.g. Germany as of
16+
* 2010), this software is Copyright (c) 2007-2010 by Baptiste Lepilleur and
17+
* The JsonCpp Authors, and is released under the terms of the MIT License (see below).
18+
*
19+
* In jurisdictions which recognize Public Domain property, the user of this
20+
* software may choose to accept it either as 1) Public Domain, 2) under the
21+
* conditions of the MIT License (see below), or 3) under the terms of dual
22+
* Public Domain/MIT License conditions described here, as they choose.
23+
*
24+
* The MIT License is about as close to Public Domain as a license can get, and is
25+
* described in clear, concise terms at:
26+
*
27+
* http://en.wikipedia.org/wiki/MIT_License
28+
*
29+
* The full text of the MIT License follows:
30+
*
31+
* ========================================================================
32+
* Copyright (c) 2007-2010 Baptiste Lepilleur and The JsonCpp Authors
33+
*
34+
* Permission is hereby granted, free of charge, to any person
35+
* obtaining a copy of this software and associated documentation
36+
* files (the "Software"), to deal in the Software without
37+
* restriction, including without limitation the rights to use, copy,
38+
* modify, merge, publish, distribute, sublicense, and/or sell copies
39+
* of the Software, and to permit persons to whom the Software is
40+
* furnished to do so, subject to the following conditions:
41+
*
42+
* The above copyright notice and this permission notice shall be
43+
* included in all copies or substantial portions of the Software.
44+
*
45+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
46+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
47+
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
48+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
49+
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
50+
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
51+
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
52+
* SOFTWARE.
53+
* ========================================================================
54+
* (END LICENSE TEXT)
55+
*
56+
* The MIT license is compatible with both the GPL and commercial
57+
* software, affording one all of the rights of Public Domain with the
58+
* minor nuisance of being required to keep the above copyright notice
59+
* and license text in the source code. Note also that by accepting the
60+
* Public Domain "license" you can re-license your copy using whatever
61+
* license you like.
62+
*/
63+

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ Install a cuda toolkit (version 11.X or above)
1010

1111
Install a compiler package which supports C++17. GCC 7.x or above is a possible option.
1212

13-
Install cmake (version 3.20 or above)
13+
Install cmake (version 3.20 or above).
14+
CMake version 3.24 or newer is encouraged.
1415

1516
Install Boost program options library (More details in the next section)
1617

1718
Ensure that the path to the nvcc binary (installed via the toolkit) is available in the $PATH variable on Linux systems
18-
19+
In order to run nvbandwidth, the system should have a CUDA enabled GPU and an NVIDIA display driver that is compatible with the CUDA Toolkit being used to build nvbandwidth.
20+
For more information, refer to https://docs.nvidia.com/deploy/cuda-compatibility/
1921

2022
## Dependencies
2123
To build and run nvbandwidth please install the Boost program_options library (https://www.boost.org/doc/libs/1_66_0/doc/html/program_options.html).

common.h

Lines changed: 27 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
#ifndef COMMON_H
1919
#define COMMON_H
2020

21-
#include <cassert>
2221
#include <cmath>
2322
#include <cstdlib>
2423
#include <cuda.h>
@@ -36,30 +35,51 @@
3635
#include <optional>
3736
#include <cstring>
3837

38+
#define STRING_LENGTH 256
39+
3940
// Default constants
4041
const unsigned long long defaultLoopCount = 16;
4142
const unsigned long long defaultBufferSize = 64; // 64MB
4243
const unsigned int defaultAverageLoopCount = 3;
4344
const unsigned int _MiB = 1024 * 1024;
4445
const unsigned int numThreadPerBlock = 512;
45-
46+
const unsigned int strideLen = 16; /* cacheLine size 128 Bytes, 16 words */
47+
const unsigned int latencyMemAccessCnt = 100000; /* 100k read accesses to gauge latency */
4648
extern int deviceCount;
4749
extern unsigned int averageLoopCount;
4850
extern bool disableAffinity;
4951
extern bool skipVerification;
5052
extern bool useMean;
53+
extern bool jsonOutput;
5154
// Verbosity
5255
extern bool verbose;
56+
5357
class Verbosity {
5458
public:
55-
Verbosity() = default;
59+
bool &controlVariable;
60+
61+
Verbosity(bool &controlVariable): controlVariable(controlVariable) {};
62+
5663
template<typename T>
5764
Verbosity& operator<<(T input) {
58-
if (verbose) std::cout << input;
59-
return *this;
65+
if (!jsonOutput && controlVariable) std::cout << input;
66+
return *this;
67+
}
68+
69+
using StreamType = decltype(std::cout);
70+
Verbosity &operator<<(StreamType &(*func)(StreamType &)) {
71+
if (!jsonOutput && controlVariable) {
72+
func(std::cout);
73+
}
74+
return *this;
6075
}
6176
};
6277
extern Verbosity VERBOSE;
78+
extern Verbosity OUTPUT;
79+
80+
#ifdef _MSC_VER
81+
#define __PRETTY_FUNCTION__ __FUNCTION__
82+
#endif
6383

6484
// Rounds n up to the nearest multiple of "multiple".
6585
// if n is already a multiple of "multiple", n is returned unchanged.
@@ -175,130 +195,9 @@ class PerformanceStatistic {
175195
}
176196
};
177197

178-
template <class T> struct PeerValueMatrix {
179-
std::optional <T> *m_matrix;
180-
int m_rows, m_columns;
181-
std::string key;
182-
183-
PeerValueMatrix(int rows, int columns, std::string key = ""): m_matrix(new std::optional <T>[rows * columns]()), m_rows(rows), m_columns(columns), key(key) {}
184-
185-
~PeerValueMatrix() { delete[] m_matrix; }
186-
std::optional <T> &value(int src, int dst) {
187-
assert(src >= 0 && src < m_rows);
188-
assert(dst >= 0 && dst < m_columns);
189-
return m_matrix[src * m_columns + dst];
190-
}
191-
const std::optional <T> &value(int src, int dst) const {
192-
assert(src >= 0 && src < m_rows);
193-
assert(dst >= 0 && dst < m_columns);
194-
return m_matrix[src * m_columns + dst];
195-
}
198+
struct LatencyNode {
199+
struct LatencyNode *next;
196200
};
197201

198-
template <class T>
199-
std::ostream &operator<<(std::ostream &o, const PeerValueMatrix<T> &matrix) {
200-
// This assumes T is numeric
201-
T maxVal = std::numeric_limits<T>::min();
202-
T minVal = std::numeric_limits<T>::max();
203-
T sum = 0;
204-
int count = 0;
205-
206-
o << ' ';
207-
for (int currentDevice = 0; currentDevice < matrix.m_columns; currentDevice++) {
208-
o << std::setw(10) << currentDevice;
209-
}
210-
o << std::endl;
211-
for (int currentDevice = 0; currentDevice < matrix.m_rows; currentDevice++) {
212-
o << currentDevice;
213-
for (int peer = 0; peer < matrix.m_columns; peer++) {
214-
std::optional <T> val = matrix.value(currentDevice, peer);
215-
if (val) {
216-
o << std::setw(10) << val.value();
217-
}
218-
else {
219-
o << std::setw(10) << "N/A";
220-
}
221-
sum += val.value_or(0.0);
222-
maxVal = std::max(maxVal, val.value_or(0.0));
223-
minVal = std::min(minVal, val.value_or(0.0));
224-
if (val.value_or(0.0) > 0) count++;
225-
}
226-
o << std::endl;
227-
}
228-
o << std::endl;
229-
o << "SUM " << matrix.key << " " << sum << std::endl;
230-
231-
VERBOSE << "MIN " << matrix.key << " " << minVal << '\n';
232-
VERBOSE << "MAX " << matrix.key << " " << maxVal << '\n';
233-
VERBOSE << "AVG " << matrix.key << " " << sum / count << '\n';
234-
return o;
235-
}
236-
237-
// CUDA Error handling
238-
inline void CU_ASSERT(CUresult cuResult, const char *msg = nullptr) {
239-
if (cuResult != CUDA_SUCCESS) {
240-
const char *errDescStr, *errNameStr;
241-
cuGetErrorString(cuResult, &errDescStr);
242-
cuGetErrorName(cuResult, &errNameStr);
243-
std::cout << "[" << errNameStr << "] " << errDescStr;
244-
if (msg != nullptr) std::cout << ":\n\t" << msg;
245-
std::cout << std::endl;
246-
std::exit(1);
247-
}
248-
}
249-
250-
// NVML Error handling
251-
inline void NVML_ASSERT(nvmlReturn_t nvmlResult, const char *msg = nullptr) {
252-
if (nvmlResult != NVML_SUCCESS) {
253-
std::cout << "NVML_ERROR: [" << nvmlErrorString(nvmlResult) << "]";
254-
if (msg != nullptr) std::cout << ":\n\t" << msg;
255-
std::cout << std::endl;
256-
std::exit(1);
257-
}
258-
}
259-
260-
// NUMA optimal affinity
261-
inline void setOptimalCpuAffinity(int cudaDeviceID) {
262-
#ifdef _WIN32
263-
// NVML doesn't support setting affinity on Windows
264-
return;
265-
#endif
266-
if (disableAffinity) {
267-
return;
268-
}
269-
270-
nvmlDevice_t device;
271-
CUuuid dev_uuid;
272-
273-
std::stringstream s;
274-
std::unordered_set <unsigned char> dashPos {0, 4, 6, 8, 10};
275-
276-
CU_ASSERT(cuDeviceGetUuid(&dev_uuid, cudaDeviceID));
277-
278-
s << "GPU";
279-
for (int i = 0; i < 16; i++) {
280-
if (dashPos.count(i)) {
281-
s << '-';
282-
}
283-
s << std::hex << std::setfill('0') << std::setw(2) << (0xFF & (int)dev_uuid.bytes[i]);
284-
}
285-
286-
NVML_ASSERT(nvmlDeviceGetHandleByUUID(s.str().c_str(), &device));
287-
nvmlReturn_t result = nvmlDeviceSetCpuAffinity(device);
288-
if (result != NVML_ERROR_NOT_SUPPORTED) {
289-
NVML_ASSERT(result);
290-
}
291-
}
292-
293-
inline bool isMemoryOwnedByCUDA(void *memory) {
294-
CUmemorytype memorytype;
295-
CUresult status = cuPointerGetAttribute(&memorytype, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)memory);
296-
if (status == CUDA_ERROR_INVALID_VALUE) {
297-
return false;
298-
} else {
299-
CU_ASSERT(status);
300-
return true;
301-
}
302-
}
303202

304203
#endif

debian_install.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
# necessary software components needed to
44
# build nvbandwidth
55

6-
apt install build-essential
7-
apt install libboost-program-options-dev
8-
apt install cmake
6+
apt install -y build-essential
7+
apt install -y libboost-program-options-dev
8+
apt install -y cmake
99
output=$(cmake --version | sed -n 1p | sed 's/[^0-9]*//g')
1010
if [ $output -lt 3200 ]; then
1111
echo "Upgrade cmake version to 3.20 or above to build nvbandwidth"

detect_cuda_arch.cmake

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
include_guard(GLOBAL)

# cuda_detect_architectures(<possible_archs_var> <out_var>)
#
# Compiles and runs a small CUDA probe that queries the compute capability of
# every GPU visible at configure time, and stores the detected architecture
# list (e.g. "70;80") in <out_var> in the caller's scope.  If no CUDA compiler
# is configured this is a fatal error; if detection fails (no usable device,
# compile or run error) the fallback list held in the variable named by
# <possible_archs_var> is used instead.
#
# Adapted from
# https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/detail/detect_architectures.cmake
function(cuda_detect_architectures possible_archs_var gpu_archs)
  # Start from the caller-supplied fallback list.
  set(__gpu_archs ${${possible_archs_var}})

  # Keep every artifact in the binary dir with absolute paths so the probe
  # never depends on the configure-time working directory and never writes
  # into the source tree.
  set(eval_file "${CMAKE_BINARY_DIR}/eval_gpu_archs.cu")
  set(eval_exe "${CMAKE_BINARY_DIR}/eval_gpu_archs")
  set(error_file "${CMAKE_BINARY_DIR}/eval_gpu_archs.stderr.log")

  if(NOT DEFINED CMAKE_CUDA_COMPILER)
    message(FATAL_ERROR "No CUDA compiler specified, unable to determine machine's GPUs.")
  endif()

  if(NOT EXISTS "${eval_exe}")
    # The probe prints the detected "majorminor" values as a ;-separated list,
    # or the fallback list baked in below when no device is usable.
    file(WRITE "${eval_file}"
"
#include <cstdio>
#include <set>
#include <string>
using namespace std;
int main(int argc, char** argv) {
  set<string> archs;
  int nDevices;
  if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) {
    for(int dev=0;dev<nDevices;++dev) {
      char buff[32];
      cudaDeviceProp prop;
      if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
      sprintf(buff, \"%d%d\", prop.major, prop.minor);
      archs.insert(buff);
    }
  }
  if(archs.empty()) {
    printf(\"${__gpu_archs}\");
  } else {
    bool first = true;
    for(const auto& arch : archs) {
      printf(first? \"%s\" : \";%s\", arch.c_str());
      first = false;
    }
  }
  printf(\"\\n\");
  return 0;
}
")
    execute_process(
      COMMAND ${CMAKE_CUDA_COMPILER} -std=c++11 -o "${eval_exe}" "${eval_file}"
      WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
      RESULT_VARIABLE __compile_rc
      ERROR_FILE "${error_file}")
    if(NOT __compile_rc EQUAL 0)
      # A failed compile can leave a partial artifact behind; make sure the
      # EXISTS check below does not pick it up.
      file(REMOVE "${eval_exe}")
    endif()
  endif()

  if(EXISTS "${eval_exe}")
    execute_process(
      COMMAND "${eval_exe}"
      OUTPUT_VARIABLE __detected_archs
      RESULT_VARIABLE __run_rc
      OUTPUT_STRIP_TRAILING_WHITESPACE
      ERROR_FILE "${error_file}")
    # Only trust the probe's output on a clean, non-empty run; otherwise keep
    # the fallback list instead of clobbering it with empty/garbage output.
    if(__run_rc EQUAL 0 AND NOT "${__detected_archs}" STREQUAL "")
      set(__gpu_archs ${__detected_archs})
      message(STATUS "Auto detection of gpu-archs: ${__gpu_archs}")
    else()
      message(STATUS "Failed auto detection of gpu-archs. Falling back to using ${__gpu_archs}.")
    endif()
  else()
    message(STATUS "Failed auto detection of gpu-archs. Falling back to using ${__gpu_archs}.")
  endif()

  # Remove the configure-time build artifacts.
  file(REMOVE "${eval_file}" "${eval_exe}" "${error_file}")

  set(${gpu_archs} ${__gpu_archs} PARENT_SCOPE)
endfunction()

0 commit comments

Comments
 (0)