Skip to content

Commit

Permalink
Use machine-readable (YAML) for rank 0 output
Browse files Browse the repository at this point in the history
Add MPI device-aware support for Kokkos and OMP target
  • Loading branch information
tom91136 committed Jul 26, 2023
1 parent 28c5ef2 commit 6f0ac2e
Show file tree
Hide file tree
Showing 22 changed files with 405 additions and 236 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ out/
.directory

clover.out

.gdb_history
31 changes: 17 additions & 14 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ include(FetchContent)
#set(MODEL hip)
######

#if (NOT MODEL)
# set(MODEL std-indices)
# # set(NVHPC_OFFLOAD cc60)
if (NOT MODEL)
set(MODEL std-indices)
# set(NVHPC_OFFLOAD cc60)
# set(CXX_EXTRA_FLAGS
# -stdpar
# -gpu=cc61
Expand All @@ -27,9 +27,9 @@ include(FetchContent)
# -Ktrap=none
# -Minfo=accel
# -Minfo=stdpar)
## set(USE_TBB ON)
# set(ENABLE_MPI ON)
#endif ()
set(USE_TBB ON)
set(ENABLE_MPI ON)
endif ()

#if (NOT MODEL)
# set(MODEL hip)
Expand Down Expand Up @@ -66,18 +66,21 @@ include(FetchContent)
#if (NOT MODEL)
# set(MODEL omp-target)
# set(ENABLE_MPI ON)
## set(CXX_EXTRA_FLAGS -fopenmp -foffload=nvptx-none -foffload=-lm -fno-fast-math -fno-associative-math)
## set(CXX_EXTRA_LINK_FLAGS -fopenmp -foffload=nvptx-none -foffload=-lm -fno-fast-math -fno-associative-math)
# set(CXX_EXTRA_LIBRARIES atomic)
# set(MPI_HOME /usr/lib64/openmpi/)
#endif ()

########
######
if (NOT MODEL)
set(MODEL cuda)
set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin/nvcc)
set(CUDA_ARCH sm_60)
set(ENABLE_PROFILING ON)
set(MPI_HOME /usr/lib64/openmpi/)
endif ()
#if (NOT MODEL)
# set(MODEL cuda)
# set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin/nvcc)
# set(CUDA_ARCH sm_60)
# set(ENABLE_PROFILING ON)
# set(MPI_HOME /usr/lib64/openmpi/)
#endif ()

######
#set(MODEL sycl)
Expand Down Expand Up @@ -191,7 +194,7 @@ if (USE_ONEDPL)
FetchContent_Declare(
oneDPL
GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
GIT_TAG oneDPL-2022.1.0-rc3
GIT_TAG oneDPL-2022.2.0-rc1
)
string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
# XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package
Expand Down
96 changes: 95 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ particular order:
- Kokkos
- SYCL and SYCL 2020

TODO:
Planned:

- OpenACC
- RAJA
Expand Down Expand Up @@ -65,3 +65,97 @@ $ ./build/<model>-cloverleaf
The `MODEL` option selects one implementation of CloverLeaf to build.
The source for each model's implementation is located in `./src/<model>`.

## Running

CloverLeaf supports the following options:

```
Usage: --help [OPTIONS]
Options:
-h --help Print this message
--list List available devices with index and exit
--device <INDEX|NAME> Use device at INDEX from output of --list or substring match iff INDEX is not an id
--file,--in <FILE> Custom clover.in file FILE (defaults to clover.in if unspecified)
--out <FILE> Custom clover.out file FILE (defaults to clover.out if unspecified)
--dump <DIR> Dumps all field data in ASCII to ./DIR for debugging, DIR is created if missing
--profile Enables kernel profiling, this takes precedence over the profiler_on in clover.in
--staging-buffer <true|false|auto> If true, use a host staging buffer for device-host MPI halo exchange.
If false, use device pointers directly for MPI halo exchange.
Defaults to auto which elides the buffer if a device-aware (i.e CUDA-aware) is used.
This option is no-op for CPU-only models.
Setting this to false on an MPI that is not device-aware may cause a segfault.
```

The output on stdout is machine-readable in YAML format, where the `Output` key contains the output
in CloverLeaf 1.3's format.
For example, here's the output
of `mpirun -np 3 kokkos_cloverleaf --device 0 --file InputDecks/clover_bm_short.in --profile true`:

```yaml
---
Devices:
0: N6Kokkos4CudaE
CloverLeaf:
- Ver.: 2.000
- Deck: InputDecks/clover_bm_short.in
- Out: clover.out
- Profiler: true
MPI:
- Enabled: true
- Total ranks: 3
- Header device-awareness (CUDA-awareness): true
- Runtime device-awareness (CUDA-awareness): true
- Host-Device halo exchange staging buffer: false
Model:
- Name: Kokkos 4.0.1
- Execution: Offload (device)
- Backend space: N6Kokkos4CudaE
- Backend host space: N6Kokkos6SerialE
# ----
Output: |+1
Output file clover.out opened. All output will go there.
Args: --device 0 --file InputDecks/clover_bm_short.in --profile true
Using input: `InputDecks/clover_bm_short.in`
Problem initialised and generated
Launching hydro
Step 1 time 0 control sound timestep 0.00616258 1,1 x 0 y 0
Wall clock 0.0259612
......
Step 86 time 0.491277 control sound timestep 0.00584781 1,1 x 0 y 0
Wall clock 1.42524
Average time per cell 1.79824e-08
Step time per cell 1.69889e-08
Step 87 time 0.497124 control sound timestep 0.005848 1,1 x 0 y 0
Test problem 2 is within 1.17018e-11% of the expected solution
This test is considered PASSED
Wall clock 1.44286
First step overhead 0

Profiler Output Time Percentage
Timestep :0.110086 7.629754
Ideal Gas :0.000370 0.025662
Viscosity :0.001094 0.075812
PdV :0.058765 4.072801
Revert :0.000815 0.056463
Acceleration :0.001175 0.081414
Fluxes :0.001452 0.100665
Cell Advection :0.001999 0.138538
Momentum Advection :0.003294 0.228296
Reset :0.002566 0.177848
Summary :0.014976 1.037959
Visit :0.000000 0.000000
Tile Halo Exchange :0.000016 0.001107
Self Halo Exchange :0.009350 0.648008
MPI Halo Exchange :1.236754 85.715627
Total :1.442712 99.989953
The Rest :0.000145 0.010047

Result:
- Problem: 2
- Outcome: PASSED
```
6 changes: 2 additions & 4 deletions cuda/initialise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include "initialise.h"
#include "start.h"

std::pair<clover::context, run_args> create_context(bool silent, const std::vector<std::string> &args) {
model create_context(bool silent, const std::vector<std::string> &args) {
struct Device {
int id{};
std::string name{};
Expand All @@ -41,16 +41,14 @@ std::pair<clover::context, run_args> create_context(bool silent, const std::vect
auto [device, parsed] = list_and_parse<Device>(
silent, devices, [](auto &d) { return d.name; }, args);
clover::checkError(cudaSetDevice(device.id));
return {clover::context{}, parsed};
return model{clover::context{}, "CUDA", true, parsed};
}

void report_context(const clover::context &) {
int device = -1;
clover::checkError(cudaGetDevice(&device));
cudaDeviceProp props{};
clover::checkError(cudaGetDeviceProperties(&props, device));

std::cout << "Using CUDA:" << std::endl;
std::cout << " - Device: " //
<< props.name << " (" << (props.totalGlobalMem / 1024 / 1024) << "MB;"
<< "sm_" << props.major << props.minor << ")" << std::endl;
Expand Down
92 changes: 55 additions & 37 deletions driver/clover_leaf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@ std::ofstream of;

global_variables initialise(parallel_ &parallel, const std::vector<std::string> &args) {
global_config config;

auto &&[ctx, run_args] = create_context(!parallel.boss, args);
config.dumpDir = run_args.dumpDir;
if (parallel.boss) {
std::cout << "---" << std::endl;
}
auto model = create_context(!parallel.boss, args);
config.dumpDir = model.args.dumpDir;

bool mpi_enabled =
#ifdef NO_MPI
Expand All @@ -76,30 +78,53 @@ global_variables initialise(parallel_ &parallel, const std::vector<std::string>

std::optional<bool> mpi_cuda_aware_runtime =
#if defined(MPIX_CUDA_AWARE_SUPPORT)
MPIX_Query_cuda_support() ? true : false;
MPIX_Query_cuda_support() != 0;
#else
{};
#endif
switch (run_args.staging_buffer) {
case run_args::staging_buffer::enabled: config.staging_buffer = true; break;
case run_args::staging_buffer::disable: config.staging_buffer = false; break;
case run_args::staging_buffer::automatic:
config.staging_buffer = !(mpi_cuda_aware_header.value_or(false) && mpi_cuda_aware_runtime.value_or(false));
break;

if (!model.offload) {
if (model.args.staging_buffer == run_args::staging_buffer::enabled) {
std::cout << "WARNING: enabling staging buffer on a non-offload (host) model or device is no-op" << std::endl;
}
config.staging_buffer = false;
} else {
switch (model.args.staging_buffer) {
case run_args::staging_buffer::enabled: config.staging_buffer = true; break;
case run_args::staging_buffer::disable: config.staging_buffer = false; break;
case run_args::staging_buffer::automatic:
config.staging_buffer = !(mpi_cuda_aware_header.value_or(false) && mpi_cuda_aware_runtime.value_or(false));
break;
}
}

if (parallel.boss) {
std::cout << "MPI: " << (mpi_enabled ? "true" : "false") << std::endl;
std::cout << " - MPI header device-awareness (CUDA-awareness): "
<< (mpi_cuda_aware_header ? (*mpi_cuda_aware_header ? "true" : "false") : "unknown") << std::endl;
std::cout << " - MPI runtime device-awareness (CUDA-awareness): "
<< (mpi_cuda_aware_runtime ? (*mpi_cuda_aware_runtime ? "true" : "false") : "unknown") << std::endl;
std::cout << " - Host-Device halo exchange staging buffer: " << (config.staging_buffer ? "true" : "false") << std::endl;
report_context(ctx);
std::cout << "CloverLeaf:\n"
<< " - Ver.: " << g_version << "\n"
<< " - Deck: " << model.args.inFile << "\n"
<< " - Out: " << model.args.outFile << "\n"
<< " - Profiler: " << (model.args.profile ? (*model.args.profile ? "true" : "false") : "deck-specified") << "\n"
<< "MPI:\n"
<< " - Enabled: " << (mpi_enabled ? "true" : "false") << "\n"
<< " - Total ranks: " << parallel.max_task << "\n"
<< " - Header device-awareness (CUDA-awareness): "
<< (mpi_cuda_aware_header ? (*mpi_cuda_aware_header ? "true" : "false") : "unknown") << "\n"
<< " - Runtime device-awareness (CUDA-awareness): "
<< (mpi_cuda_aware_runtime ? (*mpi_cuda_aware_runtime ? "true" : "false") : "unknown") << "\n"
<< " - Host-Device halo exchange staging buffer: " << (!model.offload ? "N/A" : (config.staging_buffer ? "true" : "false"))
<< "\n"
<< "Model:\n"
<< " - Name: " << model.name << "\n"
<< " - Execution: " << (model.offload ? "Offload (device)" : "Host") //
<< std::endl;
report_context(model.context);
std::cout << "# ---- " << std::endl;
std::cout << "Output: |+1" << std::endl;
}

if (parallel.boss) {
of.open(run_args.outFile.empty() ? "clover.out" : run_args.outFile);
std::cout << " Output file clover.out opened. All output will go there." << std::endl;
of.open(model.args.outFile.empty() ? "clover.out" : model.args.outFile);
if (!of.is_open()) report_error((char *)"initialise", (char *)"Error opening clover.out file.");
g_out.rdbuf(of.rdbuf());
} else {
Expand All @@ -110,7 +135,6 @@ global_variables initialise(parallel_ &parallel, const std::vector<std::string>
g_out << "Clover Version " << g_version << std::endl //
<< "Task Count " << parallel.max_task << std::endl //
<< std::endl;
std::cout << "Output file clover.out opened. All output will go there." << std::endl;
}

clover_barrier();
Expand All @@ -119,18 +143,18 @@ global_variables initialise(parallel_ &parallel, const std::vector<std::string>
if (parallel.boss) {
g_out << "Clover will run from the following input:-" << std::endl << std::endl;
if (!args.empty()) {
std::cout << "Args:";
std::cout << " Args:";
for (const auto &arg : args)
std::cout << " " << arg;
std::cout << std::endl;
}
}

if (!run_args.inFile.empty()) {
if (parallel.boss) std::cout << "Using input: `" << run_args.inFile << "`" << std::endl;
g_in.open(run_args.inFile);
if (!model.args.inFile.empty()) {
if (parallel.boss) std::cout << " Using input: `" << model.args.inFile << "`" << std::endl;
g_in.open(model.args.inFile);
if (g_in.fail()) {
std::cerr << "Unable to open file: `" << run_args.inFile << "`" << std::endl;
std::cerr << "Unable to open file: `" << model.args.inFile << "`" << std::endl;
std::exit(1);
}
} else {
Expand Down Expand Up @@ -161,16 +185,16 @@ global_variables initialise(parallel_ &parallel, const std::vector<std::string>
g_out << std::endl << "Initialising and generating" << std::endl << std::endl;
}
read_input(g_in, parallel, config);
if (run_args.profile) {
config.profiler_on = *run_args.profile;
if (model.args.profile) {
config.profiler_on = *model.args.profile;
}

clover_barrier();

// globals.step = 0;
config.number_of_chunks = parallel.max_task;

auto globals = start(parallel, config, ctx);
auto globals = start(parallel, config, model.context);
clover_barrier(globals);
if (parallel.boss) {
g_out << "Starting the calculation" << std::endl;
Expand All @@ -183,24 +207,18 @@ int main(int argc, char *argv[]) {

MPI_Init(&argc, &argv);
parallel_ parallel;

if (parallel.boss) {
std::cout << std::endl
<< "Clover Version " << g_version << std::endl //
<< "Task Count " << parallel.max_task << std::endl
<< std::endl;
}

global_variables config = initialise(parallel, std::vector<std::string>(argv + 1, argv + argc));
if (parallel.boss) {
std::cout << "Launching hydro" << std::endl;
std::cout << " Launching hydro" << std::endl;
}
hydro(config, parallel);
finalise(config);
MPI_Finalize();

if (parallel.boss) {
std::cout << "Done" << (config.report_test_fail ? ", but test problem FAILED!" : "") << std::endl;
std::cout << "Result:\n"
<< " - Problem: " << (config.config.test_problem == 0 ? "none" : std::to_string(config.config.test_problem)) << "\n"
<< " - Outcome: " << (config.report_test_fail ? "FAILED" : "PASSED") << std::endl;
}
return config.report_test_fail ? EXIT_FAILURE : EXIT_SUCCESS;
}
Loading

0 comments on commit 6f0ac2e

Please sign in to comment.