Commit d013eb9

Merge pull request #31 from SC-SGS/add_kokkos_aggregation_util
Move Kokkos Kernel Aggregation Utils from Octo-Tiger to CPPuddle
G-071 authored Mar 12, 2024
2 parents 06ac940 + 1a97938 commit d013eb9
Showing 4 changed files with 215 additions and 8 deletions.
2 changes: 1 addition & 1 deletion include/buffer_manager.hpp
@@ -15,7 +15,7 @@
#include "cppuddle/memory_recycling/detail/buffer_management.hpp"
#include "cppuddle/memory_recycling/std_recycling_allocators.hpp"

-/// Deprectated LEGACY namespace. Kept around for compatiblity with old code for now
+/// Deprecated LEGACY namespace. Kept around for compatiblity with old code for now
namespace recycler {

namespace detail {
@@ -11,7 +11,9 @@

#include "cppuddle/executor_recycling/detail/executor_pools_management.hpp"

-/// main CPPuddle namespace
+/// Primary CPPuddle namespace containing the three primary feature modules
+/// (memory_recycling, executor_recycling and kernel_aggregation) in
+/// sub-namespaces
namespace cppuddle {
/// CPPuddle namespace containing the executor pool functionality
namespace executor_recycling {
@@ -432,19 +432,26 @@ template <typename Executor> class aggregated_executor {
/// How many slices are there overall - required to check the launch
/// criteria
const size_t number_slices;
+const size_t max_slices;
const size_t id;
using executor_t = Executor;
executor_slice(aggregated_executor &parent, const size_t slice_id,
-const size_t number_slices)
+const size_t number_slices, const size_t max_number_slices)
: parent(parent), notify_parent_about_destruction(true),
-number_slices(number_slices), id(slice_id) {
-}
+number_slices(number_slices), id(slice_id), max_slices(max_number_slices) {
+assert(parent.max_slices == max_slices);
+assert(number_slices >= 1);
+assert(number_slices <= max_slices);
+}
~executor_slice(void) {
// Don't notify parent if we moved away from this executor_slice
if (notify_parent_about_destruction) {
// Executor should be done by the time of destruction
// -> check here before notifying parent
+assert(parent.max_slices == max_slices);
+assert(number_slices >= 1);
+assert(number_slices <= max_slices);
// parent still in execution mode?
assert(parent.slices_exhausted == true);
// all kernel launches done?
@@ -459,7 +466,7 @@
: parent(other.parent), launch_counter(std::move(other.launch_counter)),
buffer_counter(std::move(other.buffer_counter)),
number_slices(std::move(other.number_slices)),
-id(std::move(other.id)) {
+id(std::move(other.id)), max_slices(std::move(other.max_slices)) {
other.notify_parent_about_destruction = false;
}
executor_slice &operator=(executor_slice &&other) {
@@ -468,6 +475,7 @@
buffer_counter = std::move(other.buffer_counter);
number_slices = std::move(other.number_slices);
id = std::move(other.id);
+max_slices = std::move(other.max_slices);
other.notify_parent_about_destruction = false;
}
template <typename T, typename Host_Allocator>
@@ -844,7 +852,7 @@ template <typename Executor> class aggregated_executor {
} else {
launched_slices = current_slices;
ret_fut = hpx::make_ready_future(executor_slice{*this,
-executor_slices.size(), launched_slices});
+executor_slices.size(), launched_slices, max_slices});
}
// Are we the first slice? If yes, add continuation set the
@@ -888,7 +896,7 @@ template <typename Executor> class aggregated_executor {
size_t id = 0;
for (auto &slice_promise : executor_slices) {
slice_promise.set_value(
-executor_slice{*this, id, launched_slices});
+executor_slice{*this, id, launched_slices, max_slices});
id++;
}
executor_slices.clear();
197 changes: 197 additions & 0 deletions include/cppuddle/kernel_aggregation/util/kokkos_aggregation_util.hpp
@@ -0,0 +1,197 @@
// Copyright (c) 2022-2024 Gregor Daiß
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

/// \file This file contains convenience functions for using the kernel
/// aggregation together with Kokkos kernels. They help with copying
/// aggregated views, launching aggregated Kokkos kernels, and mapping
/// aggregated views to the correct subview for the current task.
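///
/// A minimal end-to-end usage sketch (hypothetical names: assumes an
/// executor slice agg_exec obtained from an aggregated executor, aggregated
/// device views agg_in / agg_out and host views host_in / host_out sized for
/// max_slices slices):
/// \code
///   // each task maps the shared aggregated views to its own subviews
///   auto [in, out] = map_views_to_slice(agg_exec, agg_in, agg_out);
///   // only one slice per aggregated executor actually issues the copy
///   aggregated_deep_copy(agg_exec, agg_in, host_in);
///   // ... launch the aggregated Kokkos kernel working on in/out here ...
///   auto fut = aggregrated_deep_copy_async<executor_t>(agg_exec, host_out, agg_out);
///   fut.get();
/// \endcode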

// I originally developed and tested these utilities within Octotiger. See:
// STEllAR-GROUP/octotiger/pull/469 and STEllAR-GROUP/octotiger/pull/487
// However, they are a better fit for CPPuddle, as they can be used together
// with the work aggregation independently of Octotiger.
#ifndef KOKKOS_AGGREGATION_UTIL_HPP
#define KOKKOS_AGGREGATION_UTIL_HPP
#include <hpx/futures/future.hpp>
//#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#include <hpx/kokkos/executors.hpp>
#include <Kokkos_Core.hpp>
#include <hpx/kokkos.hpp>

#include <cppuddle/executor_recycling/executor_pools_interface.hpp>
#include <aggregation_manager.hpp>
#ifdef __NVCC__
#include <cuda/std/tuple>
#if defined(HPX_CUDA_VERSION) && (HPX_CUDA_VERSION < 1202)
// cuda::std::tuple structured bindings are broken in CUDA < 12.2
// See https://github.com/NVIDIA/libcudacxx/issues/316
// According to https://github.com/NVIDIA/libcudacxx/pull/317, the fix is to
// move tuple_element and tuple_size into the std namespace, which the
// following snippet does. This is only necessary for old CUDA versions;
// newer ones contain a fix for this issue.
namespace std {
template<size_t _Ip, class... _Tp>
struct tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>>
: _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>> {};
template <class... _Tp>
struct tuple_size<_CUDA_VSTD::tuple<_Tp...>>
: _CUDA_VSTD::tuple_size<_CUDA_VSTD::tuple<_Tp...>> {};
}
#endif
#endif

#if defined(__CUDACC__)
#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
#elif (defined(__clang__) && defined(__HIP__)) // for HIP compilation
#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
#else
#define CPPUDDLE_HOST_DEVICE_METHOD
#endif

namespace cppuddle {
namespace kernel_aggregation {

/// Get subview for the current slice
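///
/// Hypothetical usage sketch (assumes an aggregated view agg_view that holds
/// the data of all slices back to back):
/// \code
///   auto my_subview = get_slice_subview(slice_id, max_slices, agg_view);
/// \endcode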
template <typename Agg_view_t>
CPPUDDLE_HOST_DEVICE_METHOD typename Agg_view_t::view_type
get_slice_subview(const size_t slice_id, const size_t max_slices,
const Agg_view_t &agg_view) {
const size_t slice_size = agg_view.size() / max_slices;
return Kokkos::subview(
agg_view, std::make_pair<size_t, size_t>(slice_id * slice_size,
(slice_id + 1) * slice_size));
}

/// Convenience function mapping aggregated Kokkos views to the current
/// execution slice by using subviews
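///
/// Hypothetical usage sketch (assumes aggregated views agg_a and agg_b;
/// returns one subview per passed view, packed into a tuple):
/// \code
///   auto [a, b] = map_views_to_slice(slice_id, max_slices, agg_a, agg_b);
/// \endcode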
template <typename Integer,
std::enable_if_t<std::is_integral<Integer>::value, bool> = true,
typename Agg_view_t, typename... Args>
CPPUDDLE_HOST_DEVICE_METHOD auto
map_views_to_slice(const Integer slice_id, const Integer max_slices,
const Agg_view_t &current_arg, const Args &...rest) {
static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
"Argument not an aggregated view");
#if defined(HPX_COMPUTE_DEVICE_CODE) && defined(__NVCC__)
if constexpr (sizeof...(Args) > 0) {
return cuda::std::tuple_cat(
cuda::std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(slice_id, max_slices, rest...));
} else {
return cuda::std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
#else
if constexpr (sizeof...(Args) > 0) {
return std::tuple_cat(
std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(slice_id, max_slices, rest...));
} else {
return std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
#endif
}

/// Convenience function mapping aggregated Kokkos views to the current
/// execution slice by using subviews. This overload obtains the slice id
/// and the maximum number of slices from the given executor slice.
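///
/// Hypothetical usage sketch (as above, but slice id and maximum slice count
/// are taken from the executor slice agg_exec itself):
/// \code
///   auto [a, b] = map_views_to_slice(agg_exec, agg_a, agg_b);
/// \endcode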
template <
typename Agg_executor_t, typename Agg_view_t,
std::enable_if_t<Kokkos::is_view<typename Agg_view_t::view_type>::value,
bool> = true,
typename... Args>
CPPUDDLE_HOST_DEVICE_METHOD auto
map_views_to_slice(const Agg_executor_t &agg_exec,
const Agg_view_t &current_arg, const Args &...rest) {
const size_t slice_id = agg_exec.id;
const size_t max_slices = agg_exec.max_slices;
static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
"Argument not an aggregated view");
if constexpr (sizeof...(Args) > 0) {
return std::tuple_cat(
std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
map_views_to_slice(agg_exec, rest...));
} else {
return std::make_tuple(
get_slice_subview(slice_id, max_slices, current_arg));
}
}

/// Convenience function to perform an aggregated deep copy
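///
/// Hypothetical usage sketch (assumes aggregated views device_agg_view and
/// host_agg_view; only the slice that passes sync_aggregation_slices issues
/// the copy on behalf of all slices):
/// \code
///   aggregated_deep_copy(agg_exec, device_agg_view, host_agg_view);
/// \endcode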
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
SourceView_t &source) {
if (agg_exec.sync_aggregation_slices()) {
Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(), target,
source);
}
}

/// Convenience function to perform an aggregated deep copy, restricted to
/// the first number_slices * elements_per_slice elements of each view
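///
/// Hypothetical usage sketch (copies only the elements belonging to the
/// currently used slices):
/// \code
///   aggregated_deep_copy(agg_exec, device_agg_view, host_agg_view, per_slice);
/// \endcode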
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
SourceView_t &source, int elements_per_slice) {
if (agg_exec.sync_aggregation_slices()) {
const size_t number_slices = agg_exec.number_slices;
auto target_slices = Kokkos::subview(
target,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
auto source_slices = Kokkos::subview(
source,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(),
target_slices, source_slices);
}
}

/// Convenience function to launch an aggregated deep copy and get a future back
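///
/// Hypothetical usage sketch (executor_t is the underlying executor type of
/// the aggregated executor):
/// \code
///   auto fut = aggregrated_deep_copy_async<executor_t>(agg_exec, host_agg_view, device_agg_view);
///   fut.get();
/// \endcode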
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
typename Aggregated_Executor<executor_t>::Executor_Slice &agg_exec,
TargetView_t &target, SourceView_t &source) {
const size_t gpu_id = agg_exec.parent.gpu_id;
auto launch_copy_lambda =
[gpu_id](TargetView_t &target, SourceView_t &source,
executor_t &exec) -> hpx::shared_future<void> {
cppuddle::executor_recycling::executor_pool::select_device<
executor_t, cppuddle::executor_recycling::round_robin_pool_impl<executor_t>>(gpu_id);
return hpx::kokkos::deep_copy_async(exec.instance(), target, source);
};
return agg_exec.wrap_async(launch_copy_lambda, target, source,
agg_exec.get_underlying_executor());
}

/// Convenience function to launch an aggregated deep copy and get a future back
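///
/// Hypothetical usage sketch (as above, but restricted to the elements of the
/// currently used slices):
/// \code
///   auto fut = aggregrated_deep_copy_async<executor_t>(agg_exec, host_agg_view, device_agg_view, per_slice);
/// \endcode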
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
typename Aggregated_Executor<executor_t>::Executor_Slice &agg_exec,
TargetView_t &target, SourceView_t &source, int elements_per_slice) {
const size_t number_slices = agg_exec.number_slices;
const size_t gpu_id = agg_exec.parent.gpu_id;
auto launch_copy_lambda = [gpu_id, elements_per_slice, number_slices](
TargetView_t &target, SourceView_t &source,
executor_t &exec) -> hpx::shared_future<void> {
cppuddle::executor_recycling::executor_pool::select_device<
executor_t,
cppuddle::executor_recycling::round_robin_pool_impl<executor_t>>(
gpu_id);
auto target_slices = Kokkos::subview(
target,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
auto source_slices = Kokkos::subview(
source,
std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
return hpx::kokkos::deep_copy_async(exec.instance(), target_slices,
source_slices);
};
return agg_exec.wrap_async(launch_copy_lambda, target, source,
agg_exec.get_underlying_executor());
}

} // namespace kernel_aggregation
} // namespace cppuddle

#endif
