Commit d013eb9

Merge pull request #31 from SC-SGS/add_kokkos_aggregation_util
Move Kokkos Kernel Aggregation Utils from Octo-Tiger to CPPuddle
G-071 authored Mar 12, 2024
2 parents 06ac940 + 1a97938 commit d013eb9
Showing 4 changed files with 215 additions and 8 deletions.
2 changes: 1 addition & 1 deletion include/buffer_manager.hpp
@@ -15,7 +15,7 @@
#include "cppuddle/memory_recycling/detail/buffer_management.hpp"
#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"

-/// Deprectated LEGACY namespace. Kept around for compatiblity with old code for now
+/// Deprecated LEGACY namespace. Kept around for compatiblity with old code for now
namespace recycler {

namespace detail {
@@ -11,7 +11,9 @@

#include "cppuddle/executor_recycling/detail/executor_pools_management.hpp"

-/// main CPPuddle namespace
+/// Primary CPPuddle namespace containing the three primary feature modules
+/// (memory_recycling, executor_recycling and kernel_aggregation) in
+/// sub-namespaces
namespace cppuddle {
/// CPPuddle namespace containing the executor pool functionality
namespace executor_recycling {
@@ -432,19 +432,26 @@ template <typename Executor> class aggregated_executor {
/// How many slices are there overall - required to check the launch
/// criteria
const size_t number_slices;
+const size_t max_slices;
const size_t id;
using executor_t = Executor;
executor_slice(aggregated_executor &parent, const size_t slice_id,
-const size_t number_slices)
+const size_t number_slices, const size_t max_number_slices)
: parent(parent), notify_parent_about_destruction(true),
-number_slices(number_slices), id(slice_id) {
-}
+number_slices(number_slices), id(slice_id), max_slices(max_number_slices) {
+assert(parent.max_slices == max_slices);
+assert(number_slices >= 1);
+assert(number_slices <= max_slices);
+}
~executor_slice(void) {
// Don't notify parent if we moved away from this executor_slice
if (notify_parent_about_destruction) {
// Executor should be done by the time of destruction
// -> check here before notifying parent
+assert(parent.max_slices == max_slices);
+assert(number_slices >= 1);
+assert(number_slices <= max_slices);
// parent still in execution mode?
assert(parent.slices_exhausted == true);
// all kernel launches done?
@@ -459,7 +466,7 @@
: parent(other.parent), launch_counter(std::move(other.launch_counter)),
buffer_counter(std::move(other.buffer_counter)),
number_slices(std::move(other.number_slices)),
-id(std::move(other.id)) {
+id(std::move(other.id)), max_slices(std::move(other.max_slices)) {
other.notify_parent_about_destruction = false;
}
executor_slice &operator=(executor_slice &&other) {
@@ -468,6 +475,7 @@
buffer_counter = std::move(other.buffer_counter);
number_slices = std::move(other.number_slices);
id = std::move(other.id);
+max_slices = std::move(other.max_slices);
other.notify_parent_about_destruction = false;
}
template <typename T, typename Host_Allocator>
@@ -844,7 +852,7 @@ template <typename Executor> class aggregated_executor {
} else {
launched_slices = current_slices;
ret_fut = hpx::make_ready_future(executor_slice{*this,
-executor_slices.size(), launched_slices});
+executor_slices.size(), launched_slices, max_slices});
}
// Are we the first slice? If yes, add continuation set the
@@ -888,7 +896,7 @@ template <typename Executor> class aggregated_executor {
size_t id = 0;
for (auto &slice_promise : executor_slices) {
slice_promise.set_value(
-executor_slice{*this, id, launched_slices});
+executor_slice{*this, id, launched_slices, max_slices});
id++;
}
executor_slices.clear();
197 changes: 197 additions & 0 deletions include/cppuddle/kernel_aggregation/util/kokkos_aggregation_util.hpp
@@ -0,0 +1,197 @@
// Copyright (c) 2022-2024 Gregor Daiß
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

/// \file This file contains convenience functions for using the kernel
/// aggregation together with Kokkos kernels. They help with copying
/// aggregated views, launching aggregated Kokkos kernels, and mapping
/// aggregated views to the correct subview for the current task.
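///
/// A minimal end-to-end usage sketch (hypothetical names: assumes an
/// executor slice agg_exec obtained from an aggregated executor, aggregated
/// device views agg_in / agg_out and host views host_in / host_out sized for
/// max_slices slices):
/// \code
///   // each task maps the shared aggregated views to its own subviews
///   auto [in, out] = map_views_to_slice(agg_exec, agg_in, agg_out);
///   // only one slice per aggregated executor actually issues the copy
///   aggregated_deep_copy(agg_exec, agg_in, host_in);
///   // ... launch the aggregated Kokkos kernel working on in/out here ...
///   auto fut = aggregrated_deep_copy_async<executor_t>(agg_exec, host_out, agg_out);
///   fut.get();
/// \endcode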

// I originally developed and tested these utilities within Octotiger. See:
// STEllAR-GROUP/octotiger/pull/469 and STEllAR-GROUP/octotiger/pull/487
// However, they are a better fit for CPPuddle, as they can be used together
// with the work aggregation independently of Octotiger.
#ifndef KOKKOS_AGGREGATION_UTIL_HPP
#define KOKKOS_AGGREGATION_UTIL_HPP
#include <hpx/futures/future.hpp>
//#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#include <hpx/kokkos/executors.hpp>
#include <Kokkos_Core.hpp>
#include <hpx/kokkos.hpp>

#include <cppuddle/executor_recycling/executor_pools_interface.hpp>
#include <aggregation_manager.hpp>
#ifdef __NVCC__
#include <cuda/std/tuple>
#if defined(HPX_CUDA_VERSION) && (HPX_CUDA_VERSION < 1202)
// cuda::std::tuple structured bindings are broken in CUDA < 12.2
// See https://github.com/NVIDIA/libcudacxx/issues/316
// According to https://github.com/NVIDIA/libcudacxx/pull/317, the fix is to
// move tuple_element and tuple_size into the std namespace, which the
// following snippet does. This is only necessary for old CUDA versions;
// newer ones contain a fix for this issue.
namespace std {
template<size_t _Ip, class... _Tp>
struct tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>>
: _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>> {};
template <class... _Tp>
struct tuple_size<_CUDA_VSTD::tuple<_Tp...>>
: _CUDA_VSTD::tuple_size<_CUDA_VSTD::tuple<_Tp...>> {};
}
#endif
#endif

#if defined(__CUDACC__)
#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
#elif (defined(__clang__) && defined(__HIP__)) // for HIP compilation
#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
#else
#define CPPUDDLE_HOST_DEVICE_METHOD
#endif

namespace cppuddle {
namespace kernel_aggregation {

/// Get subview for the current slice
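///
/// Hypothetical usage sketch (assumes an aggregated view agg_view that holds
/// the data of all slices back to back):
/// \code
///   auto my_subview = get_slice_subview(slice_id, max_slices, agg_view);
/// \endcode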
template <typename Agg_view_t>
CPPUDDLE_HOST_DEVICE_METHOD typename Agg_view_t::view_type
get_slice_subview(const size_t slice_id, const size_t max_slices,
const Agg_view_t &agg_view) {
const size_t slice_size = agg_view.size() / max_slices;
return Kokkos::subview(
agg_view, std::make_pair<size_t, size_t>(slice_id * slice_size,
(slice_id + 1) * slice_size));
}

/// Convenience function mapping aggregated Kokkos views to the current
/// execution slice by using subviews
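///
/// Hypothetical usage sketch (assumes aggregated views agg_a and agg_b;
/// returns one subview per passed view, packed into a tuple):
/// \code
///   auto [a, b] = map_views_to_slice(slice_id, max_slices, agg_a, agg_b);
/// \endcode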
template <typename Integer,
std::enable_if_t<std::is_integral<Integer>::value, bool> = true,
typename Agg_view_t, typename... Args>
CPPUDDLE_HOST_DEVICE_METHOD auto
map_views_to_slice(const Integer slice_id, const Integer max_slices,
const Agg_view_t &current_arg, const Args &...rest) {
static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
"Argument not an aggregated view");
#if defined(HPX_COMPUTE_DEVICE_CODE) && defined(__NVCC__)
if constexpr (sizeof...(Args) > 0) {
return cuda::std::tuple_cat(
cuda::std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(slice_id, max_slices, rest...));
} else {
return cuda::std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
#else
if constexpr (sizeof...(Args) > 0) {
return std::tuple_cat(
std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(slice_id, max_slices, rest...));
} else {
return std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
#endif
}

/// Convenience function mapping aggregated Kokkos views to the current
/// execution slice by using subviews. This overload obtains the slice id
/// and the maximum number of slices from the given executor slice.
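///
/// Hypothetical usage sketch (as above, but slice id and maximum slice count
/// are taken from the executor slice agg_exec itself):
/// \code
///   auto [a, b] = map_views_to_slice(agg_exec, agg_a, agg_b);
/// \endcode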
template <
typename Agg_executor_t, typename Agg_view_t,
std::enable_if_t<Kokkos::is_view<typename Agg_view_t::view_type>::value,
bool> = true,
typename... Args>
CPPUDDLE_HOST_DEVICE_METHOD auto
map_views_to_slice(const Agg_executor_t &agg_exec,
const Agg_view_t &current_arg, const Args &...rest) {
const size_t slice_id = agg_exec.id;
const size_t max_slices = agg_exec.max_slices;
static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
"Argument not an aggregated view");
if constexpr (sizeof...(Args) > 0) {
return std::tuple_cat(
std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(agg_exec, rest...));
} else {
return std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
}

/// Convenience function to perform an aggregated deep copy
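///
/// Hypothetical usage sketch (assumes aggregated views device_agg_view and
/// host_agg_view; only the slice that passes sync_aggregation_slices issues
/// the copy on behalf of all slices):
/// \code
///   aggregated_deep_copy(agg_exec, device_agg_view, host_agg_view);
/// \endcode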
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
SourceView_t &source) {
if (agg_exec.sync_aggregation_slices()) {
Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(), target,
source);
}
}

/// Convenience function to perform an aggregated deep copy, restricted to
/// the first number_slices * elements_per_slice elements of each view
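///
/// Hypothetical usage sketch (copies only the elements belonging to the
/// currently used slices):
/// \code
///   aggregated_deep_copy(agg_exec, device_agg_view, host_agg_view, per_slice);
/// \endcode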
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
SourceView_t &source, int elements_per_slice) {
if (agg_exec.sync_aggregation_slices()) {
const size_t number_slices = agg_exec.number_slices;
auto target_slices = Kokkos::subview(
target,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
auto source_slices = Kokkos::subview(
source,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(),
target_slices, source_slices);
}
}

/// Convenience function to launch an aggregated deep copy and get a future back
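///
/// Hypothetical usage sketch (executor_t is the underlying executor type of
/// the aggregated executor):
/// \code
///   auto fut = aggregrated_deep_copy_async<executor_t>(agg_exec, host_agg_view, device_agg_view);
///   fut.get();
/// \endcode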
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
typename Aggregated_Executor<executor_t>::Executor_Slice &agg_exec,
TargetView_t &target, SourceView_t &source) {
const size_t gpu_id = agg_exec.parent.gpu_id;
auto launch_copy_lambda =
[gpu_id](TargetView_t &target, SourceView_t &source,
executor_t &exec) -> hpx::shared_future<void> {
cppuddle::executor_recycling::executor_pool::select_device<
executor_t, cppuddle::executor_recycling::round_robin_pool_impl<executor_t>>(gpu_id);
return hpx::kokkos::deep_copy_async(exec.instance(), target, source);
};
return agg_exec.wrap_async(launch_copy_lambda, target, source,
agg_exec.get_underlying_executor());
}

/// Convenience function to launch an aggregated deep copy and get a future back
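///
/// Hypothetical usage sketch (as above, but restricted to the elements of the
/// currently used slices):
/// \code
///   auto fut = aggregrated_deep_copy_async<executor_t>(agg_exec, host_agg_view, device_agg_view, per_slice);
/// \endcode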
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
typename Aggregated_Executor<executor_t>::Executor_Slice &agg_exec,
TargetView_t &target, SourceView_t &source, int elements_per_slice) {
const size_t number_slices = agg_exec.number_slices;
const size_t gpu_id = agg_exec.parent.gpu_id;
auto launch_copy_lambda = [gpu_id, elements_per_slice, number_slices](
TargetView_t &target, SourceView_t &source,
executor_t &exec) -> hpx::shared_future<void> {
cppuddle::executor_recycling::executor_pool::select_device<
executor_t,
cppuddle::executor_recycling::round_robin_pool_impl<executor_t>>(
gpu_id);
auto target_slices = Kokkos::subview(
target,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
auto source_slices = Kokkos::subview(
source,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
return hpx::kokkos::deep_copy_async(exec.instance(), target_slices,
source_slices);
};
return agg_exec.wrap_async(launch_copy_lambda, target, source,
agg_exec.get_underlying_executor());
}

} // namespace kernel_aggregation
} // namespace cppuddle

#endif
