Skip to content

Commit

Permalink
Use machine-readable (YAML) for rank 0 output
Browse files Browse the repository at this point in the history
Add MPI device-aware support for Kokkos and OMP target
  • Loading branch information
tom91136 committed Jul 26, 2023
1 parent 28c5ef2 commit 6f0ac2e
Show file tree
Hide file tree
Showing 22 changed files with 405 additions and 236 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ out/
.directory

clover.out

.gdb_history
31 changes: 17 additions & 14 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ include(FetchContent)
#set(MODEL hip)
######

#if (NOT MODEL)
# set(MODEL std-indices)
# # set(NVHPC_OFFLOAD cc60)
if (NOT MODEL)
set(MODEL std-indices)
# set(NVHPC_OFFLOAD cc60)
# set(CXX_EXTRA_FLAGS
# -stdpar
# -gpu=cc61
Expand All @@ -27,9 +27,9 @@ include(FetchContent)
# -Ktrap=none
# -Minfo=accel
# -Minfo=stdpar)
## set(USE_TBB ON)
# set(ENABLE_MPI ON)
#endif ()
set(USE_TBB ON)
set(ENABLE_MPI ON)
endif ()

#if (NOT MODEL)
# set(MODEL hip)
Expand Down Expand Up @@ -66,18 +66,21 @@ include(FetchContent)
#if (NOT MODEL)
# set(MODEL omp-target)
# set(ENABLE_MPI ON)
## set(CXX_EXTRA_FLAGS -fopenmp -foffload=nvptx-none -foffload=-lm -fno-fast-math -fno-associative-math)
## set(CXX_EXTRA_LINK_FLAGS -fopenmp -foffload=nvptx-none -foffload=-lm -fno-fast-math -fno-associative-math)
# set(CXX_EXTRA_LIBRARIES atomic)
# set(MPI_HOME /usr/lib64/openmpi/)
#endif ()

########
######
if (NOT MODEL)
set(MODEL cuda)
set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin/nvcc)
set(CUDA_ARCH sm_60)
set(ENABLE_PROFILING ON)
set(MPI_HOME /usr/lib64/openmpi/)
endif ()
#if (NOT MODEL)
# set(MODEL cuda)
# set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin/nvcc)
# set(CUDA_ARCH sm_60)
# set(ENABLE_PROFILING ON)
# set(MPI_HOME /usr/lib64/openmpi/)
#endif ()

######
#set(MODEL sycl)
Expand Down Expand Up @@ -191,7 +194,7 @@ if (USE_ONEDPL)
FetchContent_Declare(
oneDPL
GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git
GIT_TAG oneDPL-2022.1.0-rc3
GIT_TAG oneDPL-2022.2.0-rc1
)
string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND)
# XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package
Expand Down
96 changes: 95 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ particular order:
- Kokkos
- SYCL and SYCL 2020

TODO:
Planned:

- OpenACC
- RAJA
Expand Down Expand Up @@ -65,3 +65,97 @@ $ ./build/<model>-cloverleaf
The `MODEL` option selects one implementation of CloverLeaf to build.
The source for each model's implementation is located in `./src/<model>`.

## Running

CloverLeaf supports the following options:

```
Usage: --help [OPTIONS]
Options:
-h --help Print this message
--list List available devices with index and exit
--device <INDEX|NAME> Use device at INDEX from output of --list or substring match iff INDEX is not an id
--file,--in <FILE> Custom clover.in file FILE (defaults to clover.in if unspecified)
--out <FILE> Custom clover.out file FILE (defaults to clover.out if unspecified)
--dump <DIR> Dumps all field data in ASCII to ./DIR for debugging, DIR is created if missing
--profile Enables kernel profiling, this takes precedence over the profiler_on in clover.in
--staging-buffer <true|false|auto> If true, use a host staging buffer for device-host MPI halo exchange.
If false, use device pointers directly for MPI halo exchange.
Defaults to auto which elides the buffer if a device-aware (i.e CUDA-aware) is used.
This option is no-op for CPU-only models.
Setting this to false on an MPI that is not device-aware may cause a segfault.
```

The output on stdout is machine-readable in YAML format, where the `Output` key contains the output
in CloverLeaf 1.3's format.
For example, here's the output
of `mpirun -np 3 kokkos_cloverleaf --device 0 --file InputDecks/clover_bm_short.in --profile true`:

```yaml
---
Devices:
0: N6Kokkos4CudaE
CloverLeaf:
- Ver.: 2.000
- Deck: InputDecks/clover_bm_short.in
- Out: clover.out
- Profiler: true
MPI:
- Enabled: true
- Total ranks: 3
- Header device-awareness (CUDA-awareness): true
- Runtime device-awareness (CUDA-awareness): true
- Host-Device halo exchange staging buffer: false
Model:
- Name: Kokkos 4.0.1
- Execution: Offload (device)
- Backend space: N6Kokkos4CudaE
- Backend host space: N6Kokkos6SerialE
# ----
Output: |+1
Output file clover.out opened. All output will go there.
Args: --device 0 --file InputDecks/clover_bm_short.in --profile true
Using input: `InputDecks/clover_bm_short.in`
Problem initialised and generated
Launching hydro
Step 1 time 0 control sound timestep 0.00616258 1,1 x 0 y 0
Wall clock 0.0259612
......
Step 86 time 0.491277 control sound timestep 0.00584781 1,1 x 0 y 0
Wall clock 1.42524
Average time per cell 1.79824e-08
Step time per cell 1.69889e-08
Step 87 time 0.497124 control sound timestep 0.005848 1,1 x 0 y 0
Test problem 2 is within 1.17018e-11% of the expected solution
This test is considered PASSED
Wall clock 1.44286
First step overhead 0

Profiler Output Time Percentage
Timestep :0.110086 7.629754
Ideal Gas :0.000370 0.025662
Viscosity :0.001094 0.075812
PdV :0.058765 4.072801
Revert :0.000815 0.056463
Acceleration :0.001175 0.081414
Fluxes :0.001452 0.100665
Cell Advection :0.001999 0.138538
Momentum Advection :0.003294 0.228296
Reset :0.002566 0.177848
Summary :0.014976 1.037959
Visit :0.000000 0.000000
Tile Halo Exchange :0.000016 0.001107
Self Halo Exchange :0.009350 0.648008
MPI Halo Exchange :1.236754 85.715627
Total :1.442712 99.989953
The Rest :0.000145 0.010047

Result:
- Problem: 2
- Outcome: PASSED
```
6 changes: 2 additions & 4 deletions cuda/initialise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include "initialise.h"
#include "start.h"

std::pair<clover::context, run_args> create_context(bool silent, const std::vector<std::string> &args) {
model create_context(bool silent, const std::vector<std::string> &args) {
struct Device {
int id{};
std::string name{};
Expand All @@ -41,16 +41,14 @@ std::pair<clover::context, run_args> create_context(bool silent, const std::vect
auto [device, parsed] = list_and_parse<Device>(
silent, devices, [](auto &d) { return d.name; }, args);
clover::checkError(cudaSetDevice(device.id));
return {clover::context{}, parsed};
return model{clover::context{}, "CUDA", true, parsed};
}

void report_context(const clover::context &) {
int device = -1;
clover::checkError(cudaGetDevice(&device));
cudaDeviceProp props{};
clover::checkError(cudaGetDeviceProperties(&props, device));

std::cout << "Using CUDA:" << std::endl;
std::cout << " - Device: " //
<< props.name << " (" << (props.totalGlobalMem / 1024 / 1024) << "MB;"
<< "sm_" << props.major << props.minor << ")" << std::endl;
Expand Down
92 changes: 55 additions & 37 deletions driver/clover_leaf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@ std::ofstream of;

global_variables initialise(parallel_ &parallel, const std::vector<std::string> &args) {
global_config config;

auto &&[ctx, run_args] = create_context(!parallel.boss, args);
config.dumpDir = run_args.dumpDir;
if (parallel.boss) {
std::cout << "---" << std::endl;
}
auto model = create_context(!parallel.boss, args);
config.dumpDir = model.args.dumpDir;

bool mpi_enabled =
#ifdef NO_MPI
Expand All @@ -76,30 +78,53 @@ global_variables initialise(parallel_ &parallel, const std::vector<std::string>

std::optional<bool> mpi_cuda_aware_runtime =
#if defined(MPIX_CUDA_AWARE_SUPPORT)
MPIX_Query_cuda_support() ? true : false;
MPIX_Query_cuda_support() != 0;
#else
{};
#endif
switch (run_args.staging_buffer) {
case run_args::staging_buffer::enabled: config.staging_buffer = true; break;
case run_args::staging_buffer::disable: config.staging_buffer = false; break;
case run_args::staging_buffer::automatic:
config.staging_buffer = !(mpi_cuda_aware_header.value_or(false) && mpi_cuda_aware_runtime.value_or(false));
break;

if (!model.offload) {
if (model.args.staging_buffer == run_args::staging_buffer::enabled) {
std::cout << "WARNING: enabling staging buffer on a non-offload (host) model or device is no-op" << std::endl;
}
config.staging_buffer = false;
} else {
switch (model.args.staging_buffer) {
case run_args::staging_buffer::enabled: config.staging_buffer = true; break;
case run_args::staging_buffer::disable: config.staging_buffer = false; break;
case run_args::staging_buffer::automatic:
config.staging_buffer = !(mpi_cuda_aware_header.value_or(false) && mpi_cuda_aware_runtime.value_or(false));
break;
}
}

if (parallel.boss) {
std::cout << "MPI: " << (mpi_enabled ? "true" : "false") << std::endl;
std::cout << " - MPI header device-awareness (CUDA-awareness): "
<< (mpi_cuda_aware_header ? (*mpi_cuda_aware_header ? "true" : "false") : "unknown") << std::endl;
std::cout << " - MPI runtime device-awareness (CUDA-awareness): "
<< (mpi_cuda_aware_runtime ? (*mpi_cuda_aware_runtime ? "true" : "false") : "unknown") << std::endl;
std::cout << " - Host-Device halo exchange staging buffer: " << (config.staging_buffer ? "true" : "false") << std::endl;
report_context(ctx);
std::cout << "CloverLeaf:\n"
<< " - Ver.: " << g_version << "\n"
<< " - Deck: " << model.args.inFile << "\n"
<< " - Out: " << model.args.outFile << "\n"
<< " - Profiler: " << (model.args.profile ? (*model.args.profile ? "true" : "false") : "deck-specified") << "\n"
<< "MPI:\n"
<< " - Enabled: " << (mpi_enabled ? "true" : "false") << "\n"
<< " - Total ranks: " << parallel.max_task << "\n"
<< " - Header device-awareness (CUDA-awareness): "
<< (mpi_cuda_aware_header ? (*mpi_cuda_aware_header ? "true" : "false") : "unknown") << "\n"
<< " - Runtime device-awareness (CUDA-awareness): "
<< (mpi_cuda_aware_runtime ? (*mpi_cuda_aware_runtime ? "true" : "false") : "unknown") << "\n"
<< " - Host-Device halo exchange staging buffer: " << (!model.offload ? "N/A" : (config.staging_buffer ? "true" : "false"))
<< "\n"
<< "Model:\n"
<< " - Name: " << model.name << "\n"
<< " - Execution: " << (model.offload ? "Offload (device)" : "Host") //
<< std::endl;
report_context(model.context);
std::cout << "# ---- " << std::endl;
std::cout << "Output: |+1" << std::endl;
}

if (parallel.boss) {
of.open(run_args.outFile.empty() ? "clover.out" : run_args.outFile);
std::cout << " Output file clover.out opened. All output will go there." << std::endl;
of.open(model.args.outFile.empty() ? "clover.out" : model.args.outFile);
if (!of.is_open()) report_error((char *)"initialise", (char *)"Error opening clover.out file.");
g_out.rdbuf(of.rdbuf());
} else {
Expand All @@ -110,7 +135,6 @@ global_variables initialise(parallel_ &parallel, const std::vector<std::string>
g_out << "Clover Version " << g_version << std::endl //
<< "Task Count " << parallel.max_task << std::endl //
<< std::endl;
std::cout << "Output file clover.out opened. All output will go there." << std::endl;
}

clover_barrier();
Expand All @@ -119,18 +143,18 @@ global_variables initialise(parallel_ &parallel, const std::vector<std::string>
if (parallel.boss) {
g_out << "Clover will run from the following input:-" << std::endl << std::endl;
if (!args.empty()) {
std::cout << "Args:";
std::cout << " Args:";
for (const auto &arg : args)
std::cout << " " << arg;
std::cout << std::endl;
}
}

if (!run_args.inFile.empty()) {
if (parallel.boss) std::cout << "Using input: `" << run_args.inFile << "`" << std::endl;
g_in.open(run_args.inFile);
if (!model.args.inFile.empty()) {
if (parallel.boss) std::cout << " Using input: `" << model.args.inFile << "`" << std::endl;
g_in.open(model.args.inFile);
if (g_in.fail()) {
std::cerr << "Unable to open file: `" << run_args.inFile << "`" << std::endl;
std::cerr << "Unable to open file: `" << model.args.inFile << "`" << std::endl;
std::exit(1);
}
} else {
Expand Down Expand Up @@ -161,16 +185,16 @@ global_variables initialise(parallel_ &parallel, const std::vector<std::string>
g_out << std::endl << "Initialising and generating" << std::endl << std::endl;
}
read_input(g_in, parallel, config);
if (run_args.profile) {
config.profiler_on = *run_args.profile;
if (model.args.profile) {
config.profiler_on = *model.args.profile;
}

clover_barrier();

// globals.step = 0;
config.number_of_chunks = parallel.max_task;

auto globals = start(parallel, config, ctx);
auto globals = start(parallel, config, model.context);
clover_barrier(globals);
if (parallel.boss) {
g_out << "Starting the calculation" << std::endl;
Expand All @@ -183,24 +207,18 @@ int main(int argc, char *argv[]) {

MPI_Init(&argc, &argv);
parallel_ parallel;

if (parallel.boss) {
std::cout << std::endl
<< "Clover Version " << g_version << std::endl //
<< "Task Count " << parallel.max_task << std::endl
<< std::endl;
}

global_variables config = initialise(parallel, std::vector<std::string>(argv + 1, argv + argc));
if (parallel.boss) {
std::cout << "Launching hydro" << std::endl;
std::cout << " Launching hydro" << std::endl;
}
hydro(config, parallel);
finalise(config);
MPI_Finalize();

if (parallel.boss) {
std::cout << "Done" << (config.report_test_fail ? ", but test problem FAILED!" : "") << std::endl;
std::cout << "Result:\n"
<< " - Problem: " << (config.config.test_problem == 0 ? "none" : std::to_string(config.config.test_problem)) << "\n"
<< " - Outcome: " << (config.report_test_fail ? "FAILED" : "PASSED") << std::endl;
}
return config.report_test_fail ? EXIT_FAILURE : EXIT_SUCCESS;
}
Loading

0 comments on commit 6f0ac2e

Please sign in to comment.